donut-base-beans / trainer_state.json
YufeiWeng's picture
End of training
e4ef7e3 verified
raw
history blame
175 kB
{
"best_metric": 0.038467586040496826,
"best_model_checkpoint": "./test_microsoft_dit/checkpoint-7924",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 9905,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005047955577990914,
"grad_norm": 0.8398004174232483,
"learning_rate": 2.9969712266532054e-05,
"loss": 0.3087,
"step": 10
},
{
"epoch": 0.010095911155981827,
"grad_norm": 1.147126317024231,
"learning_rate": 2.993942453306411e-05,
"loss": 0.202,
"step": 20
},
{
"epoch": 0.01514386673397274,
"grad_norm": 1.1376692056655884,
"learning_rate": 2.9909136799596164e-05,
"loss": 0.1375,
"step": 30
},
{
"epoch": 0.020191822311963654,
"grad_norm": 3.0222654342651367,
"learning_rate": 2.987884906612822e-05,
"loss": 0.1254,
"step": 40
},
{
"epoch": 0.02523977788995457,
"grad_norm": 1.3963178396224976,
"learning_rate": 2.9848561332660275e-05,
"loss": 0.1105,
"step": 50
},
{
"epoch": 0.03028773346794548,
"grad_norm": 0.741131067276001,
"learning_rate": 2.9818273599192328e-05,
"loss": 0.1022,
"step": 60
},
{
"epoch": 0.0353356890459364,
"grad_norm": 1.0705397129058838,
"learning_rate": 2.978798586572438e-05,
"loss": 0.1027,
"step": 70
},
{
"epoch": 0.04038364462392731,
"grad_norm": 1.127729892730713,
"learning_rate": 2.9757698132256435e-05,
"loss": 0.0979,
"step": 80
},
{
"epoch": 0.04543160020191822,
"grad_norm": 0.888960063457489,
"learning_rate": 2.9727410398788492e-05,
"loss": 0.1024,
"step": 90
},
{
"epoch": 0.05047955577990914,
"grad_norm": 0.9185839295387268,
"learning_rate": 2.9697122665320545e-05,
"loss": 0.1142,
"step": 100
},
{
"epoch": 0.05552751135790005,
"grad_norm": 0.737047016620636,
"learning_rate": 2.96668349318526e-05,
"loss": 0.0956,
"step": 110
},
{
"epoch": 0.06057546693589096,
"grad_norm": 0.7749747037887573,
"learning_rate": 2.9636547198384656e-05,
"loss": 0.0978,
"step": 120
},
{
"epoch": 0.06562342251388188,
"grad_norm": 1.079695224761963,
"learning_rate": 2.960625946491671e-05,
"loss": 0.092,
"step": 130
},
{
"epoch": 0.0706713780918728,
"grad_norm": 0.8315634727478027,
"learning_rate": 2.9575971731448766e-05,
"loss": 0.0975,
"step": 140
},
{
"epoch": 0.0757193336698637,
"grad_norm": 0.7270865440368652,
"learning_rate": 2.954568399798082e-05,
"loss": 0.098,
"step": 150
},
{
"epoch": 0.08076728924785462,
"grad_norm": 0.5786823630332947,
"learning_rate": 2.9515396264512873e-05,
"loss": 0.0846,
"step": 160
},
{
"epoch": 0.08581524482584553,
"grad_norm": 0.7117003798484802,
"learning_rate": 2.948510853104493e-05,
"loss": 0.0905,
"step": 170
},
{
"epoch": 0.09086320040383644,
"grad_norm": 0.6765159368515015,
"learning_rate": 2.9454820797576983e-05,
"loss": 0.0764,
"step": 180
},
{
"epoch": 0.09591115598182735,
"grad_norm": 1.1397738456726074,
"learning_rate": 2.9424533064109037e-05,
"loss": 0.0882,
"step": 190
},
{
"epoch": 0.10095911155981828,
"grad_norm": 0.6545870900154114,
"learning_rate": 2.939424533064109e-05,
"loss": 0.0991,
"step": 200
},
{
"epoch": 0.10600706713780919,
"grad_norm": 0.8882391452789307,
"learning_rate": 2.9363957597173144e-05,
"loss": 0.0902,
"step": 210
},
{
"epoch": 0.1110550227158001,
"grad_norm": 0.5973140001296997,
"learning_rate": 2.93336698637052e-05,
"loss": 0.0968,
"step": 220
},
{
"epoch": 0.11610297829379101,
"grad_norm": 1.3215384483337402,
"learning_rate": 2.9303382130237254e-05,
"loss": 0.0901,
"step": 230
},
{
"epoch": 0.12115093387178193,
"grad_norm": 0.6139042973518372,
"learning_rate": 2.9273094396769307e-05,
"loss": 0.0739,
"step": 240
},
{
"epoch": 0.12619888944977284,
"grad_norm": 0.9095037579536438,
"learning_rate": 2.9242806663301364e-05,
"loss": 0.0907,
"step": 250
},
{
"epoch": 0.13124684502776376,
"grad_norm": 1.0266954898834229,
"learning_rate": 2.9212518929833418e-05,
"loss": 0.0726,
"step": 260
},
{
"epoch": 0.13629480060575466,
"grad_norm": 0.734716534614563,
"learning_rate": 2.9182231196365474e-05,
"loss": 0.0891,
"step": 270
},
{
"epoch": 0.1413427561837456,
"grad_norm": 0.7633081674575806,
"learning_rate": 2.9151943462897528e-05,
"loss": 0.0747,
"step": 280
},
{
"epoch": 0.1463907117617365,
"grad_norm": 0.8185615539550781,
"learning_rate": 2.912165572942958e-05,
"loss": 0.0815,
"step": 290
},
{
"epoch": 0.1514386673397274,
"grad_norm": 1.2503191232681274,
"learning_rate": 2.9091367995961638e-05,
"loss": 0.0844,
"step": 300
},
{
"epoch": 0.15648662291771834,
"grad_norm": 0.52531898021698,
"learning_rate": 2.906108026249369e-05,
"loss": 0.0863,
"step": 310
},
{
"epoch": 0.16153457849570924,
"grad_norm": 0.8883135914802551,
"learning_rate": 2.9030792529025745e-05,
"loss": 0.0833,
"step": 320
},
{
"epoch": 0.16658253407370016,
"grad_norm": 0.5173369646072388,
"learning_rate": 2.90005047955578e-05,
"loss": 0.0882,
"step": 330
},
{
"epoch": 0.17163048965169106,
"grad_norm": 0.5770648717880249,
"learning_rate": 2.8970217062089852e-05,
"loss": 0.0814,
"step": 340
},
{
"epoch": 0.17667844522968199,
"grad_norm": 0.8828192949295044,
"learning_rate": 2.893992932862191e-05,
"loss": 0.0776,
"step": 350
},
{
"epoch": 0.18172640080767288,
"grad_norm": 0.756236732006073,
"learning_rate": 2.8909641595153962e-05,
"loss": 0.0736,
"step": 360
},
{
"epoch": 0.1867743563856638,
"grad_norm": 0.47730007767677307,
"learning_rate": 2.887935386168602e-05,
"loss": 0.0856,
"step": 370
},
{
"epoch": 0.1918223119636547,
"grad_norm": 2.5338025093078613,
"learning_rate": 2.8849066128218072e-05,
"loss": 0.0879,
"step": 380
},
{
"epoch": 0.19687026754164563,
"grad_norm": 0.6218165159225464,
"learning_rate": 2.8818778394750126e-05,
"loss": 0.0724,
"step": 390
},
{
"epoch": 0.20191822311963656,
"grad_norm": 1.1621041297912598,
"learning_rate": 2.8788490661282183e-05,
"loss": 0.0742,
"step": 400
},
{
"epoch": 0.20696617869762746,
"grad_norm": 0.8511998653411865,
"learning_rate": 2.8758202927814236e-05,
"loss": 0.0798,
"step": 410
},
{
"epoch": 0.21201413427561838,
"grad_norm": 0.5848472118377686,
"learning_rate": 2.8727915194346293e-05,
"loss": 0.0834,
"step": 420
},
{
"epoch": 0.21706208985360928,
"grad_norm": 0.5747645497322083,
"learning_rate": 2.8697627460878346e-05,
"loss": 0.0745,
"step": 430
},
{
"epoch": 0.2221100454316002,
"grad_norm": 1.058206558227539,
"learning_rate": 2.86673397274104e-05,
"loss": 0.0767,
"step": 440
},
{
"epoch": 0.2271580010095911,
"grad_norm": 0.8267918825149536,
"learning_rate": 2.8637051993942453e-05,
"loss": 0.0893,
"step": 450
},
{
"epoch": 0.23220595658758203,
"grad_norm": 1.1392240524291992,
"learning_rate": 2.8606764260474507e-05,
"loss": 0.0833,
"step": 460
},
{
"epoch": 0.23725391216557296,
"grad_norm": 0.9474436044692993,
"learning_rate": 2.8576476527006564e-05,
"loss": 0.0896,
"step": 470
},
{
"epoch": 0.24230186774356385,
"grad_norm": 1.2880048751831055,
"learning_rate": 2.8546188793538617e-05,
"loss": 0.0924,
"step": 480
},
{
"epoch": 0.24734982332155478,
"grad_norm": 0.6342403888702393,
"learning_rate": 2.851590106007067e-05,
"loss": 0.0799,
"step": 490
},
{
"epoch": 0.2523977788995457,
"grad_norm": 0.5780256986618042,
"learning_rate": 2.8485613326602727e-05,
"loss": 0.0798,
"step": 500
},
{
"epoch": 0.2574457344775366,
"grad_norm": 0.7743504643440247,
"learning_rate": 2.845532559313478e-05,
"loss": 0.0681,
"step": 510
},
{
"epoch": 0.26249369005552753,
"grad_norm": 0.5771861672401428,
"learning_rate": 2.8425037859666834e-05,
"loss": 0.0753,
"step": 520
},
{
"epoch": 0.2675416456335184,
"grad_norm": 0.6735575199127197,
"learning_rate": 2.839475012619889e-05,
"loss": 0.0773,
"step": 530
},
{
"epoch": 0.2725896012115093,
"grad_norm": 0.7692667841911316,
"learning_rate": 2.8364462392730945e-05,
"loss": 0.0732,
"step": 540
},
{
"epoch": 0.27763755678950025,
"grad_norm": 0.5109196901321411,
"learning_rate": 2.8334174659263e-05,
"loss": 0.0859,
"step": 550
},
{
"epoch": 0.2826855123674912,
"grad_norm": 0.726249098777771,
"learning_rate": 2.8303886925795055e-05,
"loss": 0.0801,
"step": 560
},
{
"epoch": 0.2877334679454821,
"grad_norm": 0.8817322254180908,
"learning_rate": 2.8273599192327108e-05,
"loss": 0.0739,
"step": 570
},
{
"epoch": 0.292781423523473,
"grad_norm": 0.5081413984298706,
"learning_rate": 2.8243311458859162e-05,
"loss": 0.0727,
"step": 580
},
{
"epoch": 0.2978293791014639,
"grad_norm": 0.9367203712463379,
"learning_rate": 2.8213023725391215e-05,
"loss": 0.0751,
"step": 590
},
{
"epoch": 0.3028773346794548,
"grad_norm": 0.5382592678070068,
"learning_rate": 2.8182735991923272e-05,
"loss": 0.0756,
"step": 600
},
{
"epoch": 0.30792529025744575,
"grad_norm": 0.40977007150650024,
"learning_rate": 2.8152448258455325e-05,
"loss": 0.0714,
"step": 610
},
{
"epoch": 0.3129732458354367,
"grad_norm": 0.6829769015312195,
"learning_rate": 2.812216052498738e-05,
"loss": 0.0809,
"step": 620
},
{
"epoch": 0.31802120141342755,
"grad_norm": 0.4805002212524414,
"learning_rate": 2.8091872791519436e-05,
"loss": 0.0789,
"step": 630
},
{
"epoch": 0.32306915699141847,
"grad_norm": 0.6755364537239075,
"learning_rate": 2.806158505805149e-05,
"loss": 0.0819,
"step": 640
},
{
"epoch": 0.3281171125694094,
"grad_norm": 1.3035857677459717,
"learning_rate": 2.8031297324583546e-05,
"loss": 0.0861,
"step": 650
},
{
"epoch": 0.3331650681474003,
"grad_norm": 0.7905831933021545,
"learning_rate": 2.80010095911156e-05,
"loss": 0.0739,
"step": 660
},
{
"epoch": 0.3382130237253912,
"grad_norm": 0.8810652494430542,
"learning_rate": 2.7970721857647653e-05,
"loss": 0.0678,
"step": 670
},
{
"epoch": 0.3432609793033821,
"grad_norm": 1.1220252513885498,
"learning_rate": 2.794043412417971e-05,
"loss": 0.07,
"step": 680
},
{
"epoch": 0.34830893488137304,
"grad_norm": 0.8519473075866699,
"learning_rate": 2.7910146390711763e-05,
"loss": 0.076,
"step": 690
},
{
"epoch": 0.35335689045936397,
"grad_norm": 0.49878937005996704,
"learning_rate": 2.787985865724382e-05,
"loss": 0.0787,
"step": 700
},
{
"epoch": 0.3584048460373549,
"grad_norm": 1.4854084253311157,
"learning_rate": 2.784957092377587e-05,
"loss": 0.0872,
"step": 710
},
{
"epoch": 0.36345280161534577,
"grad_norm": 0.787535548210144,
"learning_rate": 2.7819283190307924e-05,
"loss": 0.0805,
"step": 720
},
{
"epoch": 0.3685007571933367,
"grad_norm": 0.8322392106056213,
"learning_rate": 2.778899545683998e-05,
"loss": 0.0726,
"step": 730
},
{
"epoch": 0.3735487127713276,
"grad_norm": 0.48470157384872437,
"learning_rate": 2.7758707723372034e-05,
"loss": 0.0673,
"step": 740
},
{
"epoch": 0.37859666834931854,
"grad_norm": 0.8375622034072876,
"learning_rate": 2.772841998990409e-05,
"loss": 0.0767,
"step": 750
},
{
"epoch": 0.3836446239273094,
"grad_norm": 0.5212222337722778,
"learning_rate": 2.7698132256436144e-05,
"loss": 0.0737,
"step": 760
},
{
"epoch": 0.38869257950530034,
"grad_norm": 0.503209114074707,
"learning_rate": 2.7667844522968198e-05,
"loss": 0.0657,
"step": 770
},
{
"epoch": 0.39374053508329127,
"grad_norm": 0.4290629029273987,
"learning_rate": 2.7637556789500254e-05,
"loss": 0.0745,
"step": 780
},
{
"epoch": 0.3987884906612822,
"grad_norm": 0.7535534501075745,
"learning_rate": 2.7607269056032308e-05,
"loss": 0.0702,
"step": 790
},
{
"epoch": 0.4038364462392731,
"grad_norm": 0.67135089635849,
"learning_rate": 2.757698132256436e-05,
"loss": 0.0754,
"step": 800
},
{
"epoch": 0.408884401817264,
"grad_norm": 0.5307912230491638,
"learning_rate": 2.7546693589096418e-05,
"loss": 0.0717,
"step": 810
},
{
"epoch": 0.4139323573952549,
"grad_norm": 0.46130767464637756,
"learning_rate": 2.751640585562847e-05,
"loss": 0.065,
"step": 820
},
{
"epoch": 0.41898031297324584,
"grad_norm": 1.2904905080795288,
"learning_rate": 2.748611812216053e-05,
"loss": 0.0818,
"step": 830
},
{
"epoch": 0.42402826855123676,
"grad_norm": 2.0480494499206543,
"learning_rate": 2.745583038869258e-05,
"loss": 0.085,
"step": 840
},
{
"epoch": 0.4290762241292277,
"grad_norm": 0.5108308792114258,
"learning_rate": 2.7425542655224632e-05,
"loss": 0.0729,
"step": 850
},
{
"epoch": 0.43412417970721856,
"grad_norm": 0.6915296912193298,
"learning_rate": 2.739525492175669e-05,
"loss": 0.071,
"step": 860
},
{
"epoch": 0.4391721352852095,
"grad_norm": 0.8100910782814026,
"learning_rate": 2.7364967188288742e-05,
"loss": 0.0667,
"step": 870
},
{
"epoch": 0.4442200908632004,
"grad_norm": 0.626818835735321,
"learning_rate": 2.73346794548208e-05,
"loss": 0.0695,
"step": 880
},
{
"epoch": 0.44926804644119134,
"grad_norm": 0.673156201839447,
"learning_rate": 2.7304391721352853e-05,
"loss": 0.0793,
"step": 890
},
{
"epoch": 0.4543160020191822,
"grad_norm": 0.5740798711776733,
"learning_rate": 2.7274103987884906e-05,
"loss": 0.0731,
"step": 900
},
{
"epoch": 0.45936395759717313,
"grad_norm": 0.744429349899292,
"learning_rate": 2.7243816254416963e-05,
"loss": 0.0743,
"step": 910
},
{
"epoch": 0.46441191317516406,
"grad_norm": 0.5837222933769226,
"learning_rate": 2.7213528520949016e-05,
"loss": 0.0747,
"step": 920
},
{
"epoch": 0.469459868753155,
"grad_norm": 0.500978410243988,
"learning_rate": 2.7183240787481073e-05,
"loss": 0.0753,
"step": 930
},
{
"epoch": 0.4745078243311459,
"grad_norm": 1.0817604064941406,
"learning_rate": 2.7152953054013127e-05,
"loss": 0.0748,
"step": 940
},
{
"epoch": 0.4795557799091368,
"grad_norm": 0.5821205377578735,
"learning_rate": 2.712266532054518e-05,
"loss": 0.0766,
"step": 950
},
{
"epoch": 0.4846037354871277,
"grad_norm": 0.6120801568031311,
"learning_rate": 2.7092377587077233e-05,
"loss": 0.0827,
"step": 960
},
{
"epoch": 0.48965169106511863,
"grad_norm": 0.4379239082336426,
"learning_rate": 2.7062089853609287e-05,
"loss": 0.0664,
"step": 970
},
{
"epoch": 0.49469964664310956,
"grad_norm": 0.5472243428230286,
"learning_rate": 2.7031802120141344e-05,
"loss": 0.0767,
"step": 980
},
{
"epoch": 0.49974760222110043,
"grad_norm": 1.0190905332565308,
"learning_rate": 2.7001514386673397e-05,
"loss": 0.0739,
"step": 990
},
{
"epoch": 0.5047955577990914,
"grad_norm": 0.7046610713005066,
"learning_rate": 2.697122665320545e-05,
"loss": 0.0685,
"step": 1000
},
{
"epoch": 0.5098435133770823,
"grad_norm": 0.5559498071670532,
"learning_rate": 2.6940938919737507e-05,
"loss": 0.0715,
"step": 1010
},
{
"epoch": 0.5148914689550732,
"grad_norm": 0.6298381686210632,
"learning_rate": 2.691065118626956e-05,
"loss": 0.0828,
"step": 1020
},
{
"epoch": 0.5199394245330641,
"grad_norm": 0.7023555636405945,
"learning_rate": 2.6880363452801618e-05,
"loss": 0.0809,
"step": 1030
},
{
"epoch": 0.5249873801110551,
"grad_norm": 0.6804683804512024,
"learning_rate": 2.685007571933367e-05,
"loss": 0.0739,
"step": 1040
},
{
"epoch": 0.5300353356890459,
"grad_norm": 0.7743015885353088,
"learning_rate": 2.6819787985865725e-05,
"loss": 0.0658,
"step": 1050
},
{
"epoch": 0.5350832912670368,
"grad_norm": 1.36810302734375,
"learning_rate": 2.678950025239778e-05,
"loss": 0.0747,
"step": 1060
},
{
"epoch": 0.5401312468450278,
"grad_norm": 0.47373896837234497,
"learning_rate": 2.6759212518929835e-05,
"loss": 0.0751,
"step": 1070
},
{
"epoch": 0.5451792024230186,
"grad_norm": 0.6654021143913269,
"learning_rate": 2.6728924785461892e-05,
"loss": 0.0683,
"step": 1080
},
{
"epoch": 0.5502271580010096,
"grad_norm": 1.0054854154586792,
"learning_rate": 2.6698637051993942e-05,
"loss": 0.0676,
"step": 1090
},
{
"epoch": 0.5552751135790005,
"grad_norm": 0.5544041395187378,
"learning_rate": 2.6668349318525995e-05,
"loss": 0.075,
"step": 1100
},
{
"epoch": 0.5603230691569914,
"grad_norm": 0.6919006109237671,
"learning_rate": 2.6638061585058052e-05,
"loss": 0.0709,
"step": 1110
},
{
"epoch": 0.5653710247349824,
"grad_norm": 0.5584747791290283,
"learning_rate": 2.6607773851590106e-05,
"loss": 0.0623,
"step": 1120
},
{
"epoch": 0.5704189803129732,
"grad_norm": 0.47064319252967834,
"learning_rate": 2.657748611812216e-05,
"loss": 0.0744,
"step": 1130
},
{
"epoch": 0.5754669358909642,
"grad_norm": 0.5119986534118652,
"learning_rate": 2.6547198384654216e-05,
"loss": 0.0795,
"step": 1140
},
{
"epoch": 0.5805148914689551,
"grad_norm": 0.9572923183441162,
"learning_rate": 2.651691065118627e-05,
"loss": 0.073,
"step": 1150
},
{
"epoch": 0.585562847046946,
"grad_norm": 0.5633489489555359,
"learning_rate": 2.6486622917718326e-05,
"loss": 0.0637,
"step": 1160
},
{
"epoch": 0.5906108026249369,
"grad_norm": 1.1218105554580688,
"learning_rate": 2.645633518425038e-05,
"loss": 0.0695,
"step": 1170
},
{
"epoch": 0.5956587582029278,
"grad_norm": 0.6655285954475403,
"learning_rate": 2.6426047450782433e-05,
"loss": 0.0774,
"step": 1180
},
{
"epoch": 0.6007067137809188,
"grad_norm": 1.3088024854660034,
"learning_rate": 2.639575971731449e-05,
"loss": 0.0748,
"step": 1190
},
{
"epoch": 0.6057546693589096,
"grad_norm": 0.9868513941764832,
"learning_rate": 2.6365471983846543e-05,
"loss": 0.0695,
"step": 1200
},
{
"epoch": 0.6108026249369005,
"grad_norm": 0.5922626852989197,
"learning_rate": 2.63351842503786e-05,
"loss": 0.0678,
"step": 1210
},
{
"epoch": 0.6158505805148915,
"grad_norm": 0.6839954257011414,
"learning_rate": 2.630489651691065e-05,
"loss": 0.0693,
"step": 1220
},
{
"epoch": 0.6208985360928824,
"grad_norm": 0.6755519509315491,
"learning_rate": 2.6274608783442704e-05,
"loss": 0.0742,
"step": 1230
},
{
"epoch": 0.6259464916708734,
"grad_norm": 0.4968509078025818,
"learning_rate": 2.624432104997476e-05,
"loss": 0.0615,
"step": 1240
},
{
"epoch": 0.6309944472488642,
"grad_norm": 1.1036404371261597,
"learning_rate": 2.6214033316506814e-05,
"loss": 0.0727,
"step": 1250
},
{
"epoch": 0.6360424028268551,
"grad_norm": 0.810405969619751,
"learning_rate": 2.618374558303887e-05,
"loss": 0.072,
"step": 1260
},
{
"epoch": 0.6410903584048461,
"grad_norm": 0.730140209197998,
"learning_rate": 2.6153457849570924e-05,
"loss": 0.0652,
"step": 1270
},
{
"epoch": 0.6461383139828369,
"grad_norm": 1.1645480394363403,
"learning_rate": 2.6123170116102978e-05,
"loss": 0.0716,
"step": 1280
},
{
"epoch": 0.6511862695608278,
"grad_norm": 0.8481037020683289,
"learning_rate": 2.6092882382635034e-05,
"loss": 0.0737,
"step": 1290
},
{
"epoch": 0.6562342251388188,
"grad_norm": 0.5972946882247925,
"learning_rate": 2.6062594649167088e-05,
"loss": 0.0704,
"step": 1300
},
{
"epoch": 0.6612821807168097,
"grad_norm": 0.6405556201934814,
"learning_rate": 2.6032306915699145e-05,
"loss": 0.0628,
"step": 1310
},
{
"epoch": 0.6663301362948006,
"grad_norm": 0.8645715117454529,
"learning_rate": 2.6002019182231198e-05,
"loss": 0.0742,
"step": 1320
},
{
"epoch": 0.6713780918727915,
"grad_norm": 1.4211089611053467,
"learning_rate": 2.597173144876325e-05,
"loss": 0.0731,
"step": 1330
},
{
"epoch": 0.6764260474507824,
"grad_norm": 0.8079481720924377,
"learning_rate": 2.594144371529531e-05,
"loss": 0.0732,
"step": 1340
},
{
"epoch": 0.6814740030287734,
"grad_norm": 0.6517273783683777,
"learning_rate": 2.591115598182736e-05,
"loss": 0.0688,
"step": 1350
},
{
"epoch": 0.6865219586067642,
"grad_norm": 1.2093323469161987,
"learning_rate": 2.5880868248359415e-05,
"loss": 0.0729,
"step": 1360
},
{
"epoch": 0.6915699141847552,
"grad_norm": 0.6432307362556458,
"learning_rate": 2.585058051489147e-05,
"loss": 0.076,
"step": 1370
},
{
"epoch": 0.6966178697627461,
"grad_norm": 0.5220794677734375,
"learning_rate": 2.5820292781423522e-05,
"loss": 0.0702,
"step": 1380
},
{
"epoch": 0.701665825340737,
"grad_norm": 1.0983613729476929,
"learning_rate": 2.579000504795558e-05,
"loss": 0.0676,
"step": 1390
},
{
"epoch": 0.7067137809187279,
"grad_norm": 0.859348475933075,
"learning_rate": 2.5759717314487633e-05,
"loss": 0.0615,
"step": 1400
},
{
"epoch": 0.7117617364967188,
"grad_norm": 0.7912864685058594,
"learning_rate": 2.572942958101969e-05,
"loss": 0.0681,
"step": 1410
},
{
"epoch": 0.7168096920747098,
"grad_norm": 0.6189167499542236,
"learning_rate": 2.5699141847551743e-05,
"loss": 0.0682,
"step": 1420
},
{
"epoch": 0.7218576476527007,
"grad_norm": 0.5456287860870361,
"learning_rate": 2.5668854114083796e-05,
"loss": 0.0591,
"step": 1430
},
{
"epoch": 0.7269056032306915,
"grad_norm": 0.485055148601532,
"learning_rate": 2.5638566380615853e-05,
"loss": 0.0729,
"step": 1440
},
{
"epoch": 0.7319535588086825,
"grad_norm": 0.46423906087875366,
"learning_rate": 2.5608278647147907e-05,
"loss": 0.0646,
"step": 1450
},
{
"epoch": 0.7370015143866734,
"grad_norm": 0.5944865345954895,
"learning_rate": 2.557799091367996e-05,
"loss": 0.0696,
"step": 1460
},
{
"epoch": 0.7420494699646644,
"grad_norm": 0.794015645980835,
"learning_rate": 2.5547703180212014e-05,
"loss": 0.0671,
"step": 1470
},
{
"epoch": 0.7470974255426552,
"grad_norm": 0.6759900450706482,
"learning_rate": 2.5517415446744067e-05,
"loss": 0.074,
"step": 1480
},
{
"epoch": 0.7521453811206461,
"grad_norm": 0.6719480156898499,
"learning_rate": 2.5487127713276124e-05,
"loss": 0.0708,
"step": 1490
},
{
"epoch": 0.7571933366986371,
"grad_norm": 0.7934426665306091,
"learning_rate": 2.5456839979808177e-05,
"loss": 0.0664,
"step": 1500
},
{
"epoch": 0.762241292276628,
"grad_norm": 1.4169378280639648,
"learning_rate": 2.542655224634023e-05,
"loss": 0.0726,
"step": 1510
},
{
"epoch": 0.7672892478546188,
"grad_norm": 0.5849716067314148,
"learning_rate": 2.5396264512872288e-05,
"loss": 0.0709,
"step": 1520
},
{
"epoch": 0.7723372034326098,
"grad_norm": 0.8471559286117554,
"learning_rate": 2.536597677940434e-05,
"loss": 0.0764,
"step": 1530
},
{
"epoch": 0.7773851590106007,
"grad_norm": 0.7494149804115295,
"learning_rate": 2.5335689045936398e-05,
"loss": 0.0629,
"step": 1540
},
{
"epoch": 0.7824331145885917,
"grad_norm": 0.7659397721290588,
"learning_rate": 2.530540131246845e-05,
"loss": 0.061,
"step": 1550
},
{
"epoch": 0.7874810701665825,
"grad_norm": 0.8505954146385193,
"learning_rate": 2.5275113579000505e-05,
"loss": 0.0693,
"step": 1560
},
{
"epoch": 0.7925290257445734,
"grad_norm": 0.8126624226570129,
"learning_rate": 2.524482584553256e-05,
"loss": 0.0738,
"step": 1570
},
{
"epoch": 0.7975769813225644,
"grad_norm": 0.9350792765617371,
"learning_rate": 2.5214538112064615e-05,
"loss": 0.0821,
"step": 1580
},
{
"epoch": 0.8026249369005553,
"grad_norm": 1.075035810470581,
"learning_rate": 2.5184250378596672e-05,
"loss": 0.0758,
"step": 1590
},
{
"epoch": 0.8076728924785462,
"grad_norm": 0.6885321736335754,
"learning_rate": 2.5153962645128722e-05,
"loss": 0.0641,
"step": 1600
},
{
"epoch": 0.8127208480565371,
"grad_norm": 0.7702226042747498,
"learning_rate": 2.5123674911660775e-05,
"loss": 0.0642,
"step": 1610
},
{
"epoch": 0.817768803634528,
"grad_norm": 0.9809953570365906,
"learning_rate": 2.5093387178192832e-05,
"loss": 0.0759,
"step": 1620
},
{
"epoch": 0.822816759212519,
"grad_norm": 0.5996444225311279,
"learning_rate": 2.5063099444724886e-05,
"loss": 0.0686,
"step": 1630
},
{
"epoch": 0.8278647147905098,
"grad_norm": 0.5003983378410339,
"learning_rate": 2.5032811711256942e-05,
"loss": 0.0697,
"step": 1640
},
{
"epoch": 0.8329126703685008,
"grad_norm": 0.7024896740913391,
"learning_rate": 2.5002523977788996e-05,
"loss": 0.0699,
"step": 1650
},
{
"epoch": 0.8379606259464917,
"grad_norm": 0.5384397506713867,
"learning_rate": 2.497223624432105e-05,
"loss": 0.0684,
"step": 1660
},
{
"epoch": 0.8430085815244825,
"grad_norm": 1.176849126815796,
"learning_rate": 2.4941948510853106e-05,
"loss": 0.065,
"step": 1670
},
{
"epoch": 0.8480565371024735,
"grad_norm": 0.7623859643936157,
"learning_rate": 2.491166077738516e-05,
"loss": 0.0676,
"step": 1680
},
{
"epoch": 0.8531044926804644,
"grad_norm": 0.8817411065101624,
"learning_rate": 2.4881373043917216e-05,
"loss": 0.0712,
"step": 1690
},
{
"epoch": 0.8581524482584554,
"grad_norm": 0.7471240162849426,
"learning_rate": 2.485108531044927e-05,
"loss": 0.0719,
"step": 1700
},
{
"epoch": 0.8632004038364463,
"grad_norm": 0.9217013120651245,
"learning_rate": 2.4820797576981323e-05,
"loss": 0.0758,
"step": 1710
},
{
"epoch": 0.8682483594144371,
"grad_norm": 0.4985320568084717,
"learning_rate": 2.479050984351338e-05,
"loss": 0.075,
"step": 1720
},
{
"epoch": 0.8732963149924281,
"grad_norm": 0.47823965549468994,
"learning_rate": 2.476022211004543e-05,
"loss": 0.0576,
"step": 1730
},
{
"epoch": 0.878344270570419,
"grad_norm": 0.5073914527893066,
"learning_rate": 2.4729934376577487e-05,
"loss": 0.0619,
"step": 1740
},
{
"epoch": 0.8833922261484098,
"grad_norm": 0.6744971871376038,
"learning_rate": 2.469964664310954e-05,
"loss": 0.0674,
"step": 1750
},
{
"epoch": 0.8884401817264008,
"grad_norm": 0.7287705540657043,
"learning_rate": 2.4669358909641594e-05,
"loss": 0.0705,
"step": 1760
},
{
"epoch": 0.8934881373043917,
"grad_norm": 0.6387834548950195,
"learning_rate": 2.463907117617365e-05,
"loss": 0.0736,
"step": 1770
},
{
"epoch": 0.8985360928823827,
"grad_norm": 0.8428398370742798,
"learning_rate": 2.4608783442705704e-05,
"loss": 0.0741,
"step": 1780
},
{
"epoch": 0.9035840484603735,
"grad_norm": 0.6455987691879272,
"learning_rate": 2.4578495709237758e-05,
"loss": 0.0639,
"step": 1790
},
{
"epoch": 0.9086320040383644,
"grad_norm": 0.6735292673110962,
"learning_rate": 2.4548207975769815e-05,
"loss": 0.0795,
"step": 1800
},
{
"epoch": 0.9136799596163554,
"grad_norm": 0.6157563924789429,
"learning_rate": 2.4517920242301868e-05,
"loss": 0.0699,
"step": 1810
},
{
"epoch": 0.9187279151943463,
"grad_norm": 0.7483514547348022,
"learning_rate": 2.4487632508833925e-05,
"loss": 0.0681,
"step": 1820
},
{
"epoch": 0.9237758707723372,
"grad_norm": 0.5686767101287842,
"learning_rate": 2.4457344775365978e-05,
"loss": 0.0713,
"step": 1830
},
{
"epoch": 0.9288238263503281,
"grad_norm": 0.352909654378891,
"learning_rate": 2.4427057041898032e-05,
"loss": 0.0641,
"step": 1840
},
{
"epoch": 0.933871781928319,
"grad_norm": 0.6095912456512451,
"learning_rate": 2.439676930843009e-05,
"loss": 0.0794,
"step": 1850
},
{
"epoch": 0.93891973750631,
"grad_norm": 0.3929665684700012,
"learning_rate": 2.436648157496214e-05,
"loss": 0.0672,
"step": 1860
},
{
"epoch": 0.9439676930843008,
"grad_norm": 0.22026501595973969,
"learning_rate": 2.4336193841494195e-05,
"loss": 0.0699,
"step": 1870
},
{
"epoch": 0.9490156486622918,
"grad_norm": 0.5952547788619995,
"learning_rate": 2.430590610802625e-05,
"loss": 0.0733,
"step": 1880
},
{
"epoch": 0.9540636042402827,
"grad_norm": 0.7297592163085938,
"learning_rate": 2.4275618374558302e-05,
"loss": 0.0725,
"step": 1890
},
{
"epoch": 0.9591115598182736,
"grad_norm": 0.35177797079086304,
"learning_rate": 2.424533064109036e-05,
"loss": 0.0651,
"step": 1900
},
{
"epoch": 0.9641595153962645,
"grad_norm": 0.6706666350364685,
"learning_rate": 2.4215042907622413e-05,
"loss": 0.0737,
"step": 1910
},
{
"epoch": 0.9692074709742554,
"grad_norm": 0.7155650854110718,
"learning_rate": 2.418475517415447e-05,
"loss": 0.074,
"step": 1920
},
{
"epoch": 0.9742554265522464,
"grad_norm": 0.5200046300888062,
"learning_rate": 2.4154467440686523e-05,
"loss": 0.0706,
"step": 1930
},
{
"epoch": 0.9793033821302373,
"grad_norm": 0.46796679496765137,
"learning_rate": 2.4124179707218576e-05,
"loss": 0.0592,
"step": 1940
},
{
"epoch": 0.9843513377082281,
"grad_norm": 0.5713896751403809,
"learning_rate": 2.4093891973750633e-05,
"loss": 0.0586,
"step": 1950
},
{
"epoch": 0.9893992932862191,
"grad_norm": 0.9147453308105469,
"learning_rate": 2.4063604240282687e-05,
"loss": 0.0848,
"step": 1960
},
{
"epoch": 0.99444724886421,
"grad_norm": 1.1067036390304565,
"learning_rate": 2.4033316506814744e-05,
"loss": 0.07,
"step": 1970
},
{
"epoch": 0.9994952044422009,
"grad_norm": 0.5658775568008423,
"learning_rate": 2.4003028773346797e-05,
"loss": 0.0594,
"step": 1980
},
{
"epoch": 1.0,
"eval_f1": 0.9705180789481339,
"eval_loss": 0.04397369921207428,
"eval_runtime": 594.1594,
"eval_samples_per_second": 347.149,
"eval_steps_per_second": 2.713,
"step": 1981
},
{
"epoch": 1.0045431600201917,
"grad_norm": 0.6783074736595154,
"learning_rate": 2.3972741039878847e-05,
"loss": 0.0783,
"step": 1990
},
{
"epoch": 1.0095911155981827,
"grad_norm": 0.5741100311279297,
"learning_rate": 2.3942453306410904e-05,
"loss": 0.0612,
"step": 2000
},
{
"epoch": 1.0146390711761737,
"grad_norm": 0.8516017198562622,
"learning_rate": 2.3912165572942957e-05,
"loss": 0.0654,
"step": 2010
},
{
"epoch": 1.0196870267541647,
"grad_norm": 0.48648303747177124,
"learning_rate": 2.3881877839475014e-05,
"loss": 0.0659,
"step": 2020
},
{
"epoch": 1.0247349823321554,
"grad_norm": 0.48170068860054016,
"learning_rate": 2.3851590106007068e-05,
"loss": 0.0687,
"step": 2030
},
{
"epoch": 1.0297829379101464,
"grad_norm": 0.8060422539710999,
"learning_rate": 2.382130237253912e-05,
"loss": 0.0741,
"step": 2040
},
{
"epoch": 1.0348308934881374,
"grad_norm": 0.3721982538700104,
"learning_rate": 2.3791014639071178e-05,
"loss": 0.0643,
"step": 2050
},
{
"epoch": 1.0398788490661282,
"grad_norm": 0.9289938807487488,
"learning_rate": 2.376072690560323e-05,
"loss": 0.0678,
"step": 2060
},
{
"epoch": 1.0449268046441191,
"grad_norm": 0.7339480519294739,
"learning_rate": 2.3730439172135288e-05,
"loss": 0.065,
"step": 2070
},
{
"epoch": 1.0499747602221101,
"grad_norm": 0.5676091313362122,
"learning_rate": 2.370015143866734e-05,
"loss": 0.0665,
"step": 2080
},
{
"epoch": 1.0550227158001009,
"grad_norm": 1.0972354412078857,
"learning_rate": 2.3669863705199395e-05,
"loss": 0.0664,
"step": 2090
},
{
"epoch": 1.0600706713780919,
"grad_norm": 1.11980402469635,
"learning_rate": 2.3639575971731452e-05,
"loss": 0.0742,
"step": 2100
},
{
"epoch": 1.0651186269560828,
"grad_norm": 0.6586318016052246,
"learning_rate": 2.3609288238263502e-05,
"loss": 0.0755,
"step": 2110
},
{
"epoch": 1.0701665825340738,
"grad_norm": 0.6912874579429626,
"learning_rate": 2.3579000504795555e-05,
"loss": 0.0722,
"step": 2120
},
{
"epoch": 1.0752145381120646,
"grad_norm": 0.5603944659233093,
"learning_rate": 2.3548712771327612e-05,
"loss": 0.0636,
"step": 2130
},
{
"epoch": 1.0802624936900556,
"grad_norm": 0.7324510216712952,
"learning_rate": 2.3518425037859666e-05,
"loss": 0.0697,
"step": 2140
},
{
"epoch": 1.0853104492680465,
"grad_norm": 0.6833095550537109,
"learning_rate": 2.3488137304391723e-05,
"loss": 0.0678,
"step": 2150
},
{
"epoch": 1.0903584048460373,
"grad_norm": 0.49107661843299866,
"learning_rate": 2.3457849570923776e-05,
"loss": 0.0608,
"step": 2160
},
{
"epoch": 1.0954063604240283,
"grad_norm": 0.541980504989624,
"learning_rate": 2.342756183745583e-05,
"loss": 0.0645,
"step": 2170
},
{
"epoch": 1.1004543160020193,
"grad_norm": 0.487343966960907,
"learning_rate": 2.3397274103987886e-05,
"loss": 0.0573,
"step": 2180
},
{
"epoch": 1.10550227158001,
"grad_norm": 0.3503382205963135,
"learning_rate": 2.336698637051994e-05,
"loss": 0.0753,
"step": 2190
},
{
"epoch": 1.110550227158001,
"grad_norm": 0.750566840171814,
"learning_rate": 2.3336698637051997e-05,
"loss": 0.0703,
"step": 2200
},
{
"epoch": 1.115598182735992,
"grad_norm": 1.1437385082244873,
"learning_rate": 2.330641090358405e-05,
"loss": 0.0706,
"step": 2210
},
{
"epoch": 1.1206461383139827,
"grad_norm": 0.4508492648601532,
"learning_rate": 2.3276123170116103e-05,
"loss": 0.064,
"step": 2220
},
{
"epoch": 1.1256940938919737,
"grad_norm": 1.0053447484970093,
"learning_rate": 2.324583543664816e-05,
"loss": 0.0595,
"step": 2230
},
{
"epoch": 1.1307420494699647,
"grad_norm": 0.5974487662315369,
"learning_rate": 2.321554770318021e-05,
"loss": 0.0613,
"step": 2240
},
{
"epoch": 1.1357900050479555,
"grad_norm": 0.48302361369132996,
"learning_rate": 2.3185259969712267e-05,
"loss": 0.0553,
"step": 2250
},
{
"epoch": 1.1408379606259464,
"grad_norm": 0.7124462127685547,
"learning_rate": 2.315497223624432e-05,
"loss": 0.0628,
"step": 2260
},
{
"epoch": 1.1458859162039374,
"grad_norm": 0.8712441921234131,
"learning_rate": 2.3124684502776374e-05,
"loss": 0.066,
"step": 2270
},
{
"epoch": 1.1509338717819284,
"grad_norm": 0.7473580241203308,
"learning_rate": 2.309439676930843e-05,
"loss": 0.0687,
"step": 2280
},
{
"epoch": 1.1559818273599192,
"grad_norm": 0.8231186866760254,
"learning_rate": 2.3064109035840484e-05,
"loss": 0.0686,
"step": 2290
},
{
"epoch": 1.1610297829379101,
"grad_norm": 0.5205137729644775,
"learning_rate": 2.303382130237254e-05,
"loss": 0.0668,
"step": 2300
},
{
"epoch": 1.1660777385159011,
"grad_norm": 0.5173012614250183,
"learning_rate": 2.3003533568904595e-05,
"loss": 0.0664,
"step": 2310
},
{
"epoch": 1.171125694093892,
"grad_norm": 0.6976504325866699,
"learning_rate": 2.2973245835436648e-05,
"loss": 0.067,
"step": 2320
},
{
"epoch": 1.1761736496718829,
"grad_norm": 0.7795687317848206,
"learning_rate": 2.2942958101968705e-05,
"loss": 0.0591,
"step": 2330
},
{
"epoch": 1.1812216052498739,
"grad_norm": 0.35292479395866394,
"learning_rate": 2.291267036850076e-05,
"loss": 0.0721,
"step": 2340
},
{
"epoch": 1.1862695608278648,
"grad_norm": 1.548770546913147,
"learning_rate": 2.2882382635032815e-05,
"loss": 0.0608,
"step": 2350
},
{
"epoch": 1.1913175164058556,
"grad_norm": 0.521295964717865,
"learning_rate": 2.285209490156487e-05,
"loss": 0.0735,
"step": 2360
},
{
"epoch": 1.1963654719838466,
"grad_norm": 0.6001691818237305,
"learning_rate": 2.282180716809692e-05,
"loss": 0.0646,
"step": 2370
},
{
"epoch": 1.2014134275618376,
"grad_norm": 0.9061608910560608,
"learning_rate": 2.2791519434628976e-05,
"loss": 0.0598,
"step": 2380
},
{
"epoch": 1.2064613831398283,
"grad_norm": 0.6509453654289246,
"learning_rate": 2.276123170116103e-05,
"loss": 0.0591,
"step": 2390
},
{
"epoch": 1.2115093387178193,
"grad_norm": 0.4685826301574707,
"learning_rate": 2.2730943967693086e-05,
"loss": 0.0675,
"step": 2400
},
{
"epoch": 1.2165572942958103,
"grad_norm": 0.4527621865272522,
"learning_rate": 2.270065623422514e-05,
"loss": 0.0635,
"step": 2410
},
{
"epoch": 1.221605249873801,
"grad_norm": 0.46990010142326355,
"learning_rate": 2.2670368500757193e-05,
"loss": 0.0609,
"step": 2420
},
{
"epoch": 1.226653205451792,
"grad_norm": 0.7978981137275696,
"learning_rate": 2.264008076728925e-05,
"loss": 0.0682,
"step": 2430
},
{
"epoch": 1.231701161029783,
"grad_norm": 0.5001055598258972,
"learning_rate": 2.2609793033821303e-05,
"loss": 0.0657,
"step": 2440
},
{
"epoch": 1.2367491166077738,
"grad_norm": 0.7271714806556702,
"learning_rate": 2.2579505300353356e-05,
"loss": 0.0627,
"step": 2450
},
{
"epoch": 1.2417970721857647,
"grad_norm": 0.3601450026035309,
"learning_rate": 2.2549217566885413e-05,
"loss": 0.0649,
"step": 2460
},
{
"epoch": 1.2468450277637557,
"grad_norm": 0.6351629495620728,
"learning_rate": 2.2518929833417467e-05,
"loss": 0.0619,
"step": 2470
},
{
"epoch": 1.2518929833417465,
"grad_norm": 0.8523517847061157,
"learning_rate": 2.2488642099949524e-05,
"loss": 0.078,
"step": 2480
},
{
"epoch": 1.2569409389197375,
"grad_norm": 1.0878459215164185,
"learning_rate": 2.2458354366481577e-05,
"loss": 0.0636,
"step": 2490
},
{
"epoch": 1.2619888944977284,
"grad_norm": 0.6811727285385132,
"learning_rate": 2.2428066633013627e-05,
"loss": 0.0703,
"step": 2500
},
{
"epoch": 1.2670368500757192,
"grad_norm": 0.6043427586555481,
"learning_rate": 2.2397778899545684e-05,
"loss": 0.0587,
"step": 2510
},
{
"epoch": 1.2720848056537102,
"grad_norm": 0.6673144102096558,
"learning_rate": 2.2367491166077737e-05,
"loss": 0.0675,
"step": 2520
},
{
"epoch": 1.2771327612317012,
"grad_norm": 0.3510701358318329,
"learning_rate": 2.2337203432609794e-05,
"loss": 0.069,
"step": 2530
},
{
"epoch": 1.2821807168096921,
"grad_norm": 0.302438884973526,
"learning_rate": 2.2306915699141848e-05,
"loss": 0.0609,
"step": 2540
},
{
"epoch": 1.2872286723876831,
"grad_norm": 0.8073706030845642,
"learning_rate": 2.22766279656739e-05,
"loss": 0.076,
"step": 2550
},
{
"epoch": 1.2922766279656739,
"grad_norm": 0.7314086556434631,
"learning_rate": 2.2246340232205958e-05,
"loss": 0.0676,
"step": 2560
},
{
"epoch": 1.2973245835436649,
"grad_norm": 0.6998431086540222,
"learning_rate": 2.221605249873801e-05,
"loss": 0.0594,
"step": 2570
},
{
"epoch": 1.3023725391216558,
"grad_norm": 0.9340649843215942,
"learning_rate": 2.2185764765270068e-05,
"loss": 0.0601,
"step": 2580
},
{
"epoch": 1.3074204946996466,
"grad_norm": 0.5486651062965393,
"learning_rate": 2.215547703180212e-05,
"loss": 0.0752,
"step": 2590
},
{
"epoch": 1.3124684502776376,
"grad_norm": 0.3997117280960083,
"learning_rate": 2.2125189298334175e-05,
"loss": 0.0669,
"step": 2600
},
{
"epoch": 1.3175164058556286,
"grad_norm": 0.6159607172012329,
"learning_rate": 2.2094901564866232e-05,
"loss": 0.0646,
"step": 2610
},
{
"epoch": 1.3225643614336193,
"grad_norm": 1.0720511674880981,
"learning_rate": 2.2064613831398285e-05,
"loss": 0.0697,
"step": 2620
},
{
"epoch": 1.3276123170116103,
"grad_norm": 0.6496064066886902,
"learning_rate": 2.203432609793034e-05,
"loss": 0.0642,
"step": 2630
},
{
"epoch": 1.3326602725896013,
"grad_norm": 0.5649464726448059,
"learning_rate": 2.2004038364462392e-05,
"loss": 0.0596,
"step": 2640
},
{
"epoch": 1.337708228167592,
"grad_norm": 0.5532758235931396,
"learning_rate": 2.1973750630994446e-05,
"loss": 0.0651,
"step": 2650
},
{
"epoch": 1.342756183745583,
"grad_norm": 0.4955766797065735,
"learning_rate": 2.1943462897526503e-05,
"loss": 0.0661,
"step": 2660
},
{
"epoch": 1.347804139323574,
"grad_norm": 0.5403378009796143,
"learning_rate": 2.1913175164058556e-05,
"loss": 0.068,
"step": 2670
},
{
"epoch": 1.3528520949015648,
"grad_norm": 0.8987810015678406,
"learning_rate": 2.1882887430590613e-05,
"loss": 0.0551,
"step": 2680
},
{
"epoch": 1.3579000504795558,
"grad_norm": 0.5531570911407471,
"learning_rate": 2.1852599697122666e-05,
"loss": 0.0554,
"step": 2690
},
{
"epoch": 1.3629480060575467,
"grad_norm": 0.8810332417488098,
"learning_rate": 2.182231196365472e-05,
"loss": 0.0683,
"step": 2700
},
{
"epoch": 1.3679959616355375,
"grad_norm": 0.8977289199829102,
"learning_rate": 2.1792024230186777e-05,
"loss": 0.0682,
"step": 2710
},
{
"epoch": 1.3730439172135285,
"grad_norm": 0.6664491295814514,
"learning_rate": 2.176173649671883e-05,
"loss": 0.0652,
"step": 2720
},
{
"epoch": 1.3780918727915195,
"grad_norm": 0.7725427150726318,
"learning_rate": 2.1731448763250883e-05,
"loss": 0.0693,
"step": 2730
},
{
"epoch": 1.3831398283695102,
"grad_norm": 1.149824857711792,
"learning_rate": 2.170116102978294e-05,
"loss": 0.0697,
"step": 2740
},
{
"epoch": 1.3881877839475012,
"grad_norm": 0.8231659531593323,
"learning_rate": 2.167087329631499e-05,
"loss": 0.0586,
"step": 2750
},
{
"epoch": 1.3932357395254922,
"grad_norm": 0.5706813335418701,
"learning_rate": 2.1640585562847047e-05,
"loss": 0.0648,
"step": 2760
},
{
"epoch": 1.3982836951034832,
"grad_norm": 0.4602285623550415,
"learning_rate": 2.16102978293791e-05,
"loss": 0.0642,
"step": 2770
},
{
"epoch": 1.4033316506814741,
"grad_norm": 0.5022104978561401,
"learning_rate": 2.1580010095911154e-05,
"loss": 0.0582,
"step": 2780
},
{
"epoch": 1.408379606259465,
"grad_norm": 0.3675612211227417,
"learning_rate": 2.154972236244321e-05,
"loss": 0.0685,
"step": 2790
},
{
"epoch": 1.4134275618374559,
"grad_norm": 0.5692434906959534,
"learning_rate": 2.1519434628975264e-05,
"loss": 0.0625,
"step": 2800
},
{
"epoch": 1.4184755174154469,
"grad_norm": 0.44433364272117615,
"learning_rate": 2.148914689550732e-05,
"loss": 0.0683,
"step": 2810
},
{
"epoch": 1.4235234729934376,
"grad_norm": 0.5225184559822083,
"learning_rate": 2.1458859162039375e-05,
"loss": 0.0676,
"step": 2820
},
{
"epoch": 1.4285714285714286,
"grad_norm": 1.125475287437439,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.0641,
"step": 2830
},
{
"epoch": 1.4336193841494196,
"grad_norm": 0.6783428192138672,
"learning_rate": 2.1398283695103485e-05,
"loss": 0.0735,
"step": 2840
},
{
"epoch": 1.4386673397274103,
"grad_norm": 0.6056823134422302,
"learning_rate": 2.136799596163554e-05,
"loss": 0.0607,
"step": 2850
},
{
"epoch": 1.4437152953054013,
"grad_norm": 0.7588714361190796,
"learning_rate": 2.1337708228167595e-05,
"loss": 0.0638,
"step": 2860
},
{
"epoch": 1.4487632508833923,
"grad_norm": 0.5353738069534302,
"learning_rate": 2.130742049469965e-05,
"loss": 0.0628,
"step": 2870
},
{
"epoch": 1.453811206461383,
"grad_norm": 0.3690322935581207,
"learning_rate": 2.12771327612317e-05,
"loss": 0.055,
"step": 2880
},
{
"epoch": 1.458859162039374,
"grad_norm": 0.5556847453117371,
"learning_rate": 2.1246845027763756e-05,
"loss": 0.0672,
"step": 2890
},
{
"epoch": 1.463907117617365,
"grad_norm": 0.5658410787582397,
"learning_rate": 2.121655729429581e-05,
"loss": 0.0634,
"step": 2900
},
{
"epoch": 1.4689550731953558,
"grad_norm": 1.1000596284866333,
"learning_rate": 2.1186269560827866e-05,
"loss": 0.0648,
"step": 2910
},
{
"epoch": 1.4740030287733468,
"grad_norm": 0.5739458799362183,
"learning_rate": 2.115598182735992e-05,
"loss": 0.0622,
"step": 2920
},
{
"epoch": 1.4790509843513377,
"grad_norm": 0.9371837377548218,
"learning_rate": 2.1125694093891973e-05,
"loss": 0.067,
"step": 2930
},
{
"epoch": 1.4840989399293285,
"grad_norm": 0.5997252464294434,
"learning_rate": 2.109540636042403e-05,
"loss": 0.0665,
"step": 2940
},
{
"epoch": 1.4891468955073195,
"grad_norm": 0.6729413866996765,
"learning_rate": 2.1065118626956083e-05,
"loss": 0.0576,
"step": 2950
},
{
"epoch": 1.4941948510853105,
"grad_norm": 0.796592652797699,
"learning_rate": 2.103483089348814e-05,
"loss": 0.0671,
"step": 2960
},
{
"epoch": 1.4992428066633012,
"grad_norm": 0.7947612404823303,
"learning_rate": 2.1004543160020193e-05,
"loss": 0.0701,
"step": 2970
},
{
"epoch": 1.5042907622412924,
"grad_norm": 0.7790849208831787,
"learning_rate": 2.0974255426552247e-05,
"loss": 0.065,
"step": 2980
},
{
"epoch": 1.5093387178192832,
"grad_norm": 0.5330706238746643,
"learning_rate": 2.0943967693084304e-05,
"loss": 0.0587,
"step": 2990
},
{
"epoch": 1.514386673397274,
"grad_norm": 1.0482598543167114,
"learning_rate": 2.0913679959616357e-05,
"loss": 0.0696,
"step": 3000
},
{
"epoch": 1.5194346289752652,
"grad_norm": 0.46928080916404724,
"learning_rate": 2.088339222614841e-05,
"loss": 0.0668,
"step": 3010
},
{
"epoch": 1.524482584553256,
"grad_norm": 1.0525529384613037,
"learning_rate": 2.0853104492680464e-05,
"loss": 0.0664,
"step": 3020
},
{
"epoch": 1.529530540131247,
"grad_norm": 0.43941500782966614,
"learning_rate": 2.0822816759212517e-05,
"loss": 0.0642,
"step": 3030
},
{
"epoch": 1.5345784957092379,
"grad_norm": 0.6985353231430054,
"learning_rate": 2.0792529025744574e-05,
"loss": 0.068,
"step": 3040
},
{
"epoch": 1.5396264512872286,
"grad_norm": 0.6110888123512268,
"learning_rate": 2.0762241292276628e-05,
"loss": 0.0639,
"step": 3050
},
{
"epoch": 1.5446744068652196,
"grad_norm": 0.8250141739845276,
"learning_rate": 2.073195355880868e-05,
"loss": 0.0614,
"step": 3060
},
{
"epoch": 1.5497223624432106,
"grad_norm": 0.4882888197898865,
"learning_rate": 2.0701665825340738e-05,
"loss": 0.066,
"step": 3070
},
{
"epoch": 1.5547703180212014,
"grad_norm": 0.38679155707359314,
"learning_rate": 2.067137809187279e-05,
"loss": 0.0684,
"step": 3080
},
{
"epoch": 1.5598182735991923,
"grad_norm": 0.6574121117591858,
"learning_rate": 2.0641090358404848e-05,
"loss": 0.0666,
"step": 3090
},
{
"epoch": 1.5648662291771833,
"grad_norm": 0.48571038246154785,
"learning_rate": 2.0610802624936902e-05,
"loss": 0.0646,
"step": 3100
},
{
"epoch": 1.569914184755174,
"grad_norm": 0.8285214304924011,
"learning_rate": 2.0580514891468955e-05,
"loss": 0.0634,
"step": 3110
},
{
"epoch": 1.574962140333165,
"grad_norm": 0.5619475245475769,
"learning_rate": 2.0550227158001012e-05,
"loss": 0.0665,
"step": 3120
},
{
"epoch": 1.580010095911156,
"grad_norm": 0.47569337487220764,
"learning_rate": 2.0519939424533065e-05,
"loss": 0.0661,
"step": 3130
},
{
"epoch": 1.5850580514891468,
"grad_norm": 0.8858407139778137,
"learning_rate": 2.048965169106512e-05,
"loss": 0.0696,
"step": 3140
},
{
"epoch": 1.5901060070671378,
"grad_norm": 0.5578007698059082,
"learning_rate": 2.0459363957597172e-05,
"loss": 0.0547,
"step": 3150
},
{
"epoch": 1.5951539626451288,
"grad_norm": 0.6875492334365845,
"learning_rate": 2.0429076224129226e-05,
"loss": 0.0608,
"step": 3160
},
{
"epoch": 1.6002019182231195,
"grad_norm": 0.5009766221046448,
"learning_rate": 2.0398788490661283e-05,
"loss": 0.0684,
"step": 3170
},
{
"epoch": 1.6052498738011105,
"grad_norm": 0.7467596530914307,
"learning_rate": 2.0368500757193336e-05,
"loss": 0.0654,
"step": 3180
},
{
"epoch": 1.6102978293791015,
"grad_norm": 0.5688017010688782,
"learning_rate": 2.0338213023725393e-05,
"loss": 0.0594,
"step": 3190
},
{
"epoch": 1.6153457849570922,
"grad_norm": 0.9353786110877991,
"learning_rate": 2.0307925290257446e-05,
"loss": 0.0685,
"step": 3200
},
{
"epoch": 1.6203937405350834,
"grad_norm": 0.5310063362121582,
"learning_rate": 2.02776375567895e-05,
"loss": 0.0597,
"step": 3210
},
{
"epoch": 1.6254416961130742,
"grad_norm": 1.107693076133728,
"learning_rate": 2.0247349823321557e-05,
"loss": 0.0722,
"step": 3220
},
{
"epoch": 1.630489651691065,
"grad_norm": 0.688391923904419,
"learning_rate": 2.021706208985361e-05,
"loss": 0.0719,
"step": 3230
},
{
"epoch": 1.6355376072690562,
"grad_norm": 0.4255257546901703,
"learning_rate": 2.0186774356385667e-05,
"loss": 0.0638,
"step": 3240
},
{
"epoch": 1.640585562847047,
"grad_norm": 0.6049216389656067,
"learning_rate": 2.015648662291772e-05,
"loss": 0.0555,
"step": 3250
},
{
"epoch": 1.645633518425038,
"grad_norm": 0.6898351311683655,
"learning_rate": 2.012619888944977e-05,
"loss": 0.0599,
"step": 3260
},
{
"epoch": 1.650681474003029,
"grad_norm": 0.6150475144386292,
"learning_rate": 2.0095911155981827e-05,
"loss": 0.0664,
"step": 3270
},
{
"epoch": 1.6557294295810197,
"grad_norm": 0.5084889531135559,
"learning_rate": 2.006562342251388e-05,
"loss": 0.0574,
"step": 3280
},
{
"epoch": 1.6607773851590106,
"grad_norm": 0.9478010535240173,
"learning_rate": 2.0035335689045938e-05,
"loss": 0.0619,
"step": 3290
},
{
"epoch": 1.6658253407370016,
"grad_norm": 1.1725986003875732,
"learning_rate": 2.000504795557799e-05,
"loss": 0.0672,
"step": 3300
},
{
"epoch": 1.6708732963149924,
"grad_norm": 0.8932427763938904,
"learning_rate": 1.9974760222110044e-05,
"loss": 0.0604,
"step": 3310
},
{
"epoch": 1.6759212518929834,
"grad_norm": 0.4670265316963196,
"learning_rate": 1.99444724886421e-05,
"loss": 0.0658,
"step": 3320
},
{
"epoch": 1.6809692074709743,
"grad_norm": 0.518844485282898,
"learning_rate": 1.9914184755174155e-05,
"loss": 0.068,
"step": 3330
},
{
"epoch": 1.686017163048965,
"grad_norm": 0.7717642784118652,
"learning_rate": 1.988389702170621e-05,
"loss": 0.0594,
"step": 3340
},
{
"epoch": 1.691065118626956,
"grad_norm": 0.9715004563331604,
"learning_rate": 1.9853609288238265e-05,
"loss": 0.0651,
"step": 3350
},
{
"epoch": 1.696113074204947,
"grad_norm": 0.7362111210823059,
"learning_rate": 1.982332155477032e-05,
"loss": 0.0664,
"step": 3360
},
{
"epoch": 1.7011610297829378,
"grad_norm": 0.480751633644104,
"learning_rate": 1.9793033821302375e-05,
"loss": 0.0609,
"step": 3370
},
{
"epoch": 1.7062089853609288,
"grad_norm": 0.31802135705947876,
"learning_rate": 1.976274608783443e-05,
"loss": 0.0658,
"step": 3380
},
{
"epoch": 1.7112569409389198,
"grad_norm": 0.5285906195640564,
"learning_rate": 1.973245835436648e-05,
"loss": 0.0606,
"step": 3390
},
{
"epoch": 1.7163048965169105,
"grad_norm": 0.7230745553970337,
"learning_rate": 1.9702170620898536e-05,
"loss": 0.0618,
"step": 3400
},
{
"epoch": 1.7213528520949015,
"grad_norm": 0.566842257976532,
"learning_rate": 1.967188288743059e-05,
"loss": 0.0623,
"step": 3410
},
{
"epoch": 1.7264008076728925,
"grad_norm": 0.9110565781593323,
"learning_rate": 1.9641595153962646e-05,
"loss": 0.0712,
"step": 3420
},
{
"epoch": 1.7314487632508833,
"grad_norm": 0.5621252059936523,
"learning_rate": 1.96113074204947e-05,
"loss": 0.0624,
"step": 3430
},
{
"epoch": 1.7364967188288745,
"grad_norm": 0.6153441667556763,
"learning_rate": 1.9581019687026753e-05,
"loss": 0.0679,
"step": 3440
},
{
"epoch": 1.7415446744068652,
"grad_norm": 0.7521117925643921,
"learning_rate": 1.955073195355881e-05,
"loss": 0.073,
"step": 3450
},
{
"epoch": 1.746592629984856,
"grad_norm": 0.7781336307525635,
"learning_rate": 1.9520444220090863e-05,
"loss": 0.0576,
"step": 3460
},
{
"epoch": 1.7516405855628472,
"grad_norm": 0.5981038808822632,
"learning_rate": 1.949015648662292e-05,
"loss": 0.0558,
"step": 3470
},
{
"epoch": 1.756688541140838,
"grad_norm": 0.5716273188591003,
"learning_rate": 1.9459868753154973e-05,
"loss": 0.0615,
"step": 3480
},
{
"epoch": 1.761736496718829,
"grad_norm": 1.0969016551971436,
"learning_rate": 1.9429581019687027e-05,
"loss": 0.0695,
"step": 3490
},
{
"epoch": 1.76678445229682,
"grad_norm": 0.4081050157546997,
"learning_rate": 1.9399293286219084e-05,
"loss": 0.0569,
"step": 3500
},
{
"epoch": 1.7718324078748107,
"grad_norm": 0.6996564269065857,
"learning_rate": 1.9369005552751137e-05,
"loss": 0.0615,
"step": 3510
},
{
"epoch": 1.7768803634528016,
"grad_norm": 0.7040839791297913,
"learning_rate": 1.933871781928319e-05,
"loss": 0.0609,
"step": 3520
},
{
"epoch": 1.7819283190307926,
"grad_norm": 0.6955099105834961,
"learning_rate": 1.9308430085815244e-05,
"loss": 0.0596,
"step": 3530
},
{
"epoch": 1.7869762746087834,
"grad_norm": 0.49400514364242554,
"learning_rate": 1.9278142352347298e-05,
"loss": 0.0531,
"step": 3540
},
{
"epoch": 1.7920242301867744,
"grad_norm": 0.6069557666778564,
"learning_rate": 1.9247854618879354e-05,
"loss": 0.0663,
"step": 3550
},
{
"epoch": 1.7970721857647654,
"grad_norm": 0.859195351600647,
"learning_rate": 1.9217566885411408e-05,
"loss": 0.0539,
"step": 3560
},
{
"epoch": 1.802120141342756,
"grad_norm": 0.8939780592918396,
"learning_rate": 1.9187279151943465e-05,
"loss": 0.0668,
"step": 3570
},
{
"epoch": 1.807168096920747,
"grad_norm": 0.7258803248405457,
"learning_rate": 1.9156991418475518e-05,
"loss": 0.0585,
"step": 3580
},
{
"epoch": 1.812216052498738,
"grad_norm": 0.38900288939476013,
"learning_rate": 1.912670368500757e-05,
"loss": 0.0686,
"step": 3590
},
{
"epoch": 1.8172640080767288,
"grad_norm": 0.38506415486335754,
"learning_rate": 1.909641595153963e-05,
"loss": 0.0625,
"step": 3600
},
{
"epoch": 1.8223119636547198,
"grad_norm": 0.5235381722450256,
"learning_rate": 1.9066128218071682e-05,
"loss": 0.0597,
"step": 3610
},
{
"epoch": 1.8273599192327108,
"grad_norm": 0.4835253357887268,
"learning_rate": 1.903584048460374e-05,
"loss": 0.0667,
"step": 3620
},
{
"epoch": 1.8324078748107016,
"grad_norm": 0.6338971257209778,
"learning_rate": 1.9005552751135792e-05,
"loss": 0.0635,
"step": 3630
},
{
"epoch": 1.8374558303886925,
"grad_norm": 1.0663739442825317,
"learning_rate": 1.8975265017667846e-05,
"loss": 0.0744,
"step": 3640
},
{
"epoch": 1.8425037859666835,
"grad_norm": 0.6655123829841614,
"learning_rate": 1.89449772841999e-05,
"loss": 0.0654,
"step": 3650
},
{
"epoch": 1.8475517415446743,
"grad_norm": 0.582611083984375,
"learning_rate": 1.8914689550731952e-05,
"loss": 0.0661,
"step": 3660
},
{
"epoch": 1.8525996971226655,
"grad_norm": 0.6533240079879761,
"learning_rate": 1.888440181726401e-05,
"loss": 0.0613,
"step": 3670
},
{
"epoch": 1.8576476527006562,
"grad_norm": 0.4978090524673462,
"learning_rate": 1.8854114083796063e-05,
"loss": 0.0627,
"step": 3680
},
{
"epoch": 1.862695608278647,
"grad_norm": 0.7043678164482117,
"learning_rate": 1.8823826350328116e-05,
"loss": 0.0578,
"step": 3690
},
{
"epoch": 1.8677435638566382,
"grad_norm": 0.7941015362739563,
"learning_rate": 1.8793538616860173e-05,
"loss": 0.0622,
"step": 3700
},
{
"epoch": 1.872791519434629,
"grad_norm": 0.4428146183490753,
"learning_rate": 1.8763250883392226e-05,
"loss": 0.0613,
"step": 3710
},
{
"epoch": 1.87783947501262,
"grad_norm": 0.6554248929023743,
"learning_rate": 1.873296314992428e-05,
"loss": 0.0643,
"step": 3720
},
{
"epoch": 1.882887430590611,
"grad_norm": 0.48168087005615234,
"learning_rate": 1.8702675416456337e-05,
"loss": 0.055,
"step": 3730
},
{
"epoch": 1.8879353861686017,
"grad_norm": 0.509777307510376,
"learning_rate": 1.867238768298839e-05,
"loss": 0.058,
"step": 3740
},
{
"epoch": 1.8929833417465927,
"grad_norm": 0.5132505893707275,
"learning_rate": 1.8642099949520447e-05,
"loss": 0.0623,
"step": 3750
},
{
"epoch": 1.8980312973245836,
"grad_norm": 0.7474920749664307,
"learning_rate": 1.86118122160525e-05,
"loss": 0.0489,
"step": 3760
},
{
"epoch": 1.9030792529025744,
"grad_norm": 1.0404279232025146,
"learning_rate": 1.8581524482584554e-05,
"loss": 0.0687,
"step": 3770
},
{
"epoch": 1.9081272084805654,
"grad_norm": 0.6796401143074036,
"learning_rate": 1.8551236749116607e-05,
"loss": 0.0679,
"step": 3780
},
{
"epoch": 1.9131751640585564,
"grad_norm": 0.9071604609489441,
"learning_rate": 1.852094901564866e-05,
"loss": 0.0725,
"step": 3790
},
{
"epoch": 1.9182231196365471,
"grad_norm": 0.7023878693580627,
"learning_rate": 1.8490661282180718e-05,
"loss": 0.0702,
"step": 3800
},
{
"epoch": 1.923271075214538,
"grad_norm": 0.7312602996826172,
"learning_rate": 1.846037354871277e-05,
"loss": 0.0532,
"step": 3810
},
{
"epoch": 1.928319030792529,
"grad_norm": 0.6224806904792786,
"learning_rate": 1.8430085815244825e-05,
"loss": 0.0638,
"step": 3820
},
{
"epoch": 1.9333669863705198,
"grad_norm": 0.7255429029464722,
"learning_rate": 1.839979808177688e-05,
"loss": 0.0641,
"step": 3830
},
{
"epoch": 1.9384149419485108,
"grad_norm": 0.584086000919342,
"learning_rate": 1.8369510348308935e-05,
"loss": 0.0692,
"step": 3840
},
{
"epoch": 1.9434628975265018,
"grad_norm": 0.4826408326625824,
"learning_rate": 1.833922261484099e-05,
"loss": 0.0627,
"step": 3850
},
{
"epoch": 1.9485108531044926,
"grad_norm": 0.5803766846656799,
"learning_rate": 1.8308934881373045e-05,
"loss": 0.0635,
"step": 3860
},
{
"epoch": 1.9535588086824835,
"grad_norm": 0.7855948209762573,
"learning_rate": 1.82786471479051e-05,
"loss": 0.0659,
"step": 3870
},
{
"epoch": 1.9586067642604745,
"grad_norm": 0.5980962514877319,
"learning_rate": 1.8248359414437155e-05,
"loss": 0.0651,
"step": 3880
},
{
"epoch": 1.9636547198384653,
"grad_norm": 0.6440220475196838,
"learning_rate": 1.821807168096921e-05,
"loss": 0.0639,
"step": 3890
},
{
"epoch": 1.9687026754164565,
"grad_norm": 0.7104585766792297,
"learning_rate": 1.8187783947501262e-05,
"loss": 0.056,
"step": 3900
},
{
"epoch": 1.9737506309944473,
"grad_norm": 0.7219833731651306,
"learning_rate": 1.8157496214033316e-05,
"loss": 0.0574,
"step": 3910
},
{
"epoch": 1.978798586572438,
"grad_norm": 0.5478711724281311,
"learning_rate": 1.812720848056537e-05,
"loss": 0.0657,
"step": 3920
},
{
"epoch": 1.9838465421504292,
"grad_norm": 0.6501402854919434,
"learning_rate": 1.8096920747097426e-05,
"loss": 0.0641,
"step": 3930
},
{
"epoch": 1.98889449772842,
"grad_norm": 0.7231020331382751,
"learning_rate": 1.806663301362948e-05,
"loss": 0.0692,
"step": 3940
},
{
"epoch": 1.993942453306411,
"grad_norm": 0.6480854749679565,
"learning_rate": 1.8036345280161536e-05,
"loss": 0.0632,
"step": 3950
},
{
"epoch": 1.998990408884402,
"grad_norm": 0.4803590774536133,
"learning_rate": 1.800605754669359e-05,
"loss": 0.0678,
"step": 3960
},
{
"epoch": 2.0,
"eval_f1": 0.9705180789481339,
"eval_loss": 0.0446692518889904,
"eval_runtime": 584.4017,
"eval_samples_per_second": 352.946,
"eval_steps_per_second": 2.758,
"step": 3962
},
{
"epoch": 2.0040383644623927,
"grad_norm": 0.680855393409729,
"learning_rate": 1.7975769813225643e-05,
"loss": 0.0567,
"step": 3970
},
{
"epoch": 2.0090863200403835,
"grad_norm": 0.47991836071014404,
"learning_rate": 1.79454820797577e-05,
"loss": 0.0562,
"step": 3980
},
{
"epoch": 2.0141342756183747,
"grad_norm": 0.8615912199020386,
"learning_rate": 1.7915194346289753e-05,
"loss": 0.0679,
"step": 3990
},
{
"epoch": 2.0191822311963654,
"grad_norm": 0.5970327258110046,
"learning_rate": 1.7884906612821807e-05,
"loss": 0.053,
"step": 4000
},
{
"epoch": 2.024230186774356,
"grad_norm": 0.5402255654335022,
"learning_rate": 1.7854618879353864e-05,
"loss": 0.0574,
"step": 4010
},
{
"epoch": 2.0292781423523474,
"grad_norm": 0.5014840364456177,
"learning_rate": 1.7824331145885917e-05,
"loss": 0.0649,
"step": 4020
},
{
"epoch": 2.034326097930338,
"grad_norm": 0.7147154808044434,
"learning_rate": 1.779404341241797e-05,
"loss": 0.0687,
"step": 4030
},
{
"epoch": 2.0393740535083293,
"grad_norm": 0.5346552729606628,
"learning_rate": 1.7763755678950024e-05,
"loss": 0.0638,
"step": 4040
},
{
"epoch": 2.04442200908632,
"grad_norm": 0.5596599578857422,
"learning_rate": 1.7733467945482078e-05,
"loss": 0.0669,
"step": 4050
},
{
"epoch": 2.049469964664311,
"grad_norm": 0.40591198205947876,
"learning_rate": 1.7703180212014134e-05,
"loss": 0.0564,
"step": 4060
},
{
"epoch": 2.054517920242302,
"grad_norm": 0.609337568283081,
"learning_rate": 1.7672892478546188e-05,
"loss": 0.0576,
"step": 4070
},
{
"epoch": 2.059565875820293,
"grad_norm": 0.5424002408981323,
"learning_rate": 1.7642604745078245e-05,
"loss": 0.0585,
"step": 4080
},
{
"epoch": 2.0646138313982836,
"grad_norm": 0.9868631362915039,
"learning_rate": 1.7612317011610298e-05,
"loss": 0.0684,
"step": 4090
},
{
"epoch": 2.069661786976275,
"grad_norm": 0.6492929458618164,
"learning_rate": 1.758202927814235e-05,
"loss": 0.0638,
"step": 4100
},
{
"epoch": 2.0747097425542655,
"grad_norm": 0.7837685346603394,
"learning_rate": 1.755174154467441e-05,
"loss": 0.0675,
"step": 4110
},
{
"epoch": 2.0797576981322563,
"grad_norm": 0.5961639881134033,
"learning_rate": 1.7521453811206462e-05,
"loss": 0.0575,
"step": 4120
},
{
"epoch": 2.0848056537102475,
"grad_norm": 0.4114825427532196,
"learning_rate": 1.749116607773852e-05,
"loss": 0.0659,
"step": 4130
},
{
"epoch": 2.0898536092882383,
"grad_norm": 0.4567316174507141,
"learning_rate": 1.7460878344270572e-05,
"loss": 0.0661,
"step": 4140
},
{
"epoch": 2.094901564866229,
"grad_norm": 0.6321776509284973,
"learning_rate": 1.7430590610802626e-05,
"loss": 0.066,
"step": 4150
},
{
"epoch": 2.0999495204442202,
"grad_norm": 0.8911116719245911,
"learning_rate": 1.740030287733468e-05,
"loss": 0.0585,
"step": 4160
},
{
"epoch": 2.104997476022211,
"grad_norm": 0.4896914064884186,
"learning_rate": 1.7370015143866733e-05,
"loss": 0.0612,
"step": 4170
},
{
"epoch": 2.1100454316002017,
"grad_norm": 0.7571251392364502,
"learning_rate": 1.733972741039879e-05,
"loss": 0.0563,
"step": 4180
},
{
"epoch": 2.115093387178193,
"grad_norm": 0.9115099310874939,
"learning_rate": 1.7309439676930843e-05,
"loss": 0.0698,
"step": 4190
},
{
"epoch": 2.1201413427561837,
"grad_norm": 0.5267325639724731,
"learning_rate": 1.7279151943462896e-05,
"loss": 0.0604,
"step": 4200
},
{
"epoch": 2.1251892983341745,
"grad_norm": 0.6659255623817444,
"learning_rate": 1.7248864209994953e-05,
"loss": 0.0627,
"step": 4210
},
{
"epoch": 2.1302372539121657,
"grad_norm": 0.89178466796875,
"learning_rate": 1.7218576476527007e-05,
"loss": 0.0552,
"step": 4220
},
{
"epoch": 2.1352852094901564,
"grad_norm": 0.4615127742290497,
"learning_rate": 1.7188288743059063e-05,
"loss": 0.0557,
"step": 4230
},
{
"epoch": 2.1403331650681476,
"grad_norm": 0.6602596044540405,
"learning_rate": 1.7158001009591117e-05,
"loss": 0.0548,
"step": 4240
},
{
"epoch": 2.1453811206461384,
"grad_norm": 0.7081389427185059,
"learning_rate": 1.712771327612317e-05,
"loss": 0.0606,
"step": 4250
},
{
"epoch": 2.150429076224129,
"grad_norm": 0.5817338824272156,
"learning_rate": 1.7097425542655227e-05,
"loss": 0.0606,
"step": 4260
},
{
"epoch": 2.1554770318021204,
"grad_norm": 0.4401390254497528,
"learning_rate": 1.706713780918728e-05,
"loss": 0.0607,
"step": 4270
},
{
"epoch": 2.160524987380111,
"grad_norm": 1.0127087831497192,
"learning_rate": 1.7036850075719337e-05,
"loss": 0.0615,
"step": 4280
},
{
"epoch": 2.165572942958102,
"grad_norm": 0.5774319171905518,
"learning_rate": 1.7006562342251387e-05,
"loss": 0.0525,
"step": 4290
},
{
"epoch": 2.170620898536093,
"grad_norm": 0.47623270750045776,
"learning_rate": 1.697627460878344e-05,
"loss": 0.0591,
"step": 4300
},
{
"epoch": 2.175668854114084,
"grad_norm": 0.7083358764648438,
"learning_rate": 1.6945986875315498e-05,
"loss": 0.0631,
"step": 4310
},
{
"epoch": 2.1807168096920746,
"grad_norm": 0.6057601571083069,
"learning_rate": 1.691569914184755e-05,
"loss": 0.0595,
"step": 4320
},
{
"epoch": 2.185764765270066,
"grad_norm": 0.8947880864143372,
"learning_rate": 1.6885411408379605e-05,
"loss": 0.0666,
"step": 4330
},
{
"epoch": 2.1908127208480566,
"grad_norm": 0.6460204720497131,
"learning_rate": 1.685512367491166e-05,
"loss": 0.0669,
"step": 4340
},
{
"epoch": 2.1958606764260473,
"grad_norm": 0.9029686450958252,
"learning_rate": 1.6824835941443715e-05,
"loss": 0.0607,
"step": 4350
},
{
"epoch": 2.2009086320040385,
"grad_norm": 0.5201438665390015,
"learning_rate": 1.6794548207975772e-05,
"loss": 0.0514,
"step": 4360
},
{
"epoch": 2.2059565875820293,
"grad_norm": 0.39414748549461365,
"learning_rate": 1.6764260474507825e-05,
"loss": 0.0581,
"step": 4370
},
{
"epoch": 2.21100454316002,
"grad_norm": 0.642257034778595,
"learning_rate": 1.673397274103988e-05,
"loss": 0.0611,
"step": 4380
},
{
"epoch": 2.2160524987380112,
"grad_norm": 0.7225739359855652,
"learning_rate": 1.6703685007571935e-05,
"loss": 0.0569,
"step": 4390
},
{
"epoch": 2.221100454316002,
"grad_norm": 0.6948502659797668,
"learning_rate": 1.667339727410399e-05,
"loss": 0.0652,
"step": 4400
},
{
"epoch": 2.2261484098939928,
"grad_norm": 0.5755937695503235,
"learning_rate": 1.6643109540636042e-05,
"loss": 0.0566,
"step": 4410
},
{
"epoch": 2.231196365471984,
"grad_norm": 0.4249815046787262,
"learning_rate": 1.6612821807168096e-05,
"loss": 0.0642,
"step": 4420
},
{
"epoch": 2.2362443210499747,
"grad_norm": 0.5442089438438416,
"learning_rate": 1.658253407370015e-05,
"loss": 0.0685,
"step": 4430
},
{
"epoch": 2.2412922766279655,
"grad_norm": 0.8074495792388916,
"learning_rate": 1.6552246340232206e-05,
"loss": 0.0558,
"step": 4440
},
{
"epoch": 2.2463402322059567,
"grad_norm": 0.8810071349143982,
"learning_rate": 1.652195860676426e-05,
"loss": 0.0685,
"step": 4450
},
{
"epoch": 2.2513881877839474,
"grad_norm": 0.5399377942085266,
"learning_rate": 1.6491670873296316e-05,
"loss": 0.0607,
"step": 4460
},
{
"epoch": 2.256436143361938,
"grad_norm": 0.7178535461425781,
"learning_rate": 1.646138313982837e-05,
"loss": 0.0504,
"step": 4470
},
{
"epoch": 2.2614840989399294,
"grad_norm": 0.4272046983242035,
"learning_rate": 1.6431095406360423e-05,
"loss": 0.0583,
"step": 4480
},
{
"epoch": 2.26653205451792,
"grad_norm": 0.6807524561882019,
"learning_rate": 1.640080767289248e-05,
"loss": 0.0639,
"step": 4490
},
{
"epoch": 2.271580010095911,
"grad_norm": 0.5895000100135803,
"learning_rate": 1.6370519939424534e-05,
"loss": 0.0675,
"step": 4500
},
{
"epoch": 2.276627965673902,
"grad_norm": 0.6640876531600952,
"learning_rate": 1.634023220595659e-05,
"loss": 0.0603,
"step": 4510
},
{
"epoch": 2.281675921251893,
"grad_norm": 0.4367890954017639,
"learning_rate": 1.6309944472488644e-05,
"loss": 0.0517,
"step": 4520
},
{
"epoch": 2.2867238768298837,
"grad_norm": 1.082713007926941,
"learning_rate": 1.6279656739020697e-05,
"loss": 0.0524,
"step": 4530
},
{
"epoch": 2.291771832407875,
"grad_norm": 0.5186300277709961,
"learning_rate": 1.624936900555275e-05,
"loss": 0.0566,
"step": 4540
},
{
"epoch": 2.2968197879858656,
"grad_norm": 1.2778280973434448,
"learning_rate": 1.6219081272084804e-05,
"loss": 0.0531,
"step": 4550
},
{
"epoch": 2.301867743563857,
"grad_norm": 0.46757417917251587,
"learning_rate": 1.618879353861686e-05,
"loss": 0.0637,
"step": 4560
},
{
"epoch": 2.3069156991418476,
"grad_norm": 0.6333388686180115,
"learning_rate": 1.6158505805148914e-05,
"loss": 0.0557,
"step": 4570
},
{
"epoch": 2.3119636547198383,
"grad_norm": 0.4005846381187439,
"learning_rate": 1.6128218071680968e-05,
"loss": 0.0512,
"step": 4580
},
{
"epoch": 2.3170116102978295,
"grad_norm": 1.0479962825775146,
"learning_rate": 1.6097930338213025e-05,
"loss": 0.0639,
"step": 4590
},
{
"epoch": 2.3220595658758203,
"grad_norm": 1.1324669122695923,
"learning_rate": 1.6067642604745078e-05,
"loss": 0.0642,
"step": 4600
},
{
"epoch": 2.327107521453811,
"grad_norm": 0.827215313911438,
"learning_rate": 1.6037354871277135e-05,
"loss": 0.0654,
"step": 4610
},
{
"epoch": 2.3321554770318023,
"grad_norm": 0.8228656649589539,
"learning_rate": 1.600706713780919e-05,
"loss": 0.0648,
"step": 4620
},
{
"epoch": 2.337203432609793,
"grad_norm": 0.5897762775421143,
"learning_rate": 1.5976779404341242e-05,
"loss": 0.0546,
"step": 4630
},
{
"epoch": 2.342251388187784,
"grad_norm": 0.6223641633987427,
"learning_rate": 1.59464916708733e-05,
"loss": 0.0712,
"step": 4640
},
{
"epoch": 2.347299343765775,
"grad_norm": 0.5593187808990479,
"learning_rate": 1.5916203937405352e-05,
"loss": 0.0707,
"step": 4650
},
{
"epoch": 2.3523472993437657,
"grad_norm": 0.9349427223205566,
"learning_rate": 1.5885916203937406e-05,
"loss": 0.0581,
"step": 4660
},
{
"epoch": 2.3573952549217565,
"grad_norm": 0.47101134061813354,
"learning_rate": 1.585562847046946e-05,
"loss": 0.0688,
"step": 4670
},
{
"epoch": 2.3624432104997477,
"grad_norm": 0.5073738098144531,
"learning_rate": 1.5825340737001513e-05,
"loss": 0.0678,
"step": 4680
},
{
"epoch": 2.3674911660777385,
"grad_norm": 0.5324171781539917,
"learning_rate": 1.579505300353357e-05,
"loss": 0.0614,
"step": 4690
},
{
"epoch": 2.3725391216557297,
"grad_norm": 0.662965714931488,
"learning_rate": 1.5764765270065623e-05,
"loss": 0.0507,
"step": 4700
},
{
"epoch": 2.3775870772337204,
"grad_norm": 0.6482782959938049,
"learning_rate": 1.5734477536597676e-05,
"loss": 0.0537,
"step": 4710
},
{
"epoch": 2.382635032811711,
"grad_norm": 1.0039052963256836,
"learning_rate": 1.5704189803129733e-05,
"loss": 0.059,
"step": 4720
},
{
"epoch": 2.3876829883897024,
"grad_norm": 0.8546132445335388,
"learning_rate": 1.5673902069661787e-05,
"loss": 0.0691,
"step": 4730
},
{
"epoch": 2.392730943967693,
"grad_norm": 0.4903261363506317,
"learning_rate": 1.5643614336193843e-05,
"loss": 0.0535,
"step": 4740
},
{
"epoch": 2.397778899545684,
"grad_norm": 0.8538033962249756,
"learning_rate": 1.5613326602725897e-05,
"loss": 0.0616,
"step": 4750
},
{
"epoch": 2.402826855123675,
"grad_norm": 0.7978336215019226,
"learning_rate": 1.558303886925795e-05,
"loss": 0.0613,
"step": 4760
},
{
"epoch": 2.407874810701666,
"grad_norm": 0.6981778740882874,
"learning_rate": 1.5552751135790007e-05,
"loss": 0.0646,
"step": 4770
},
{
"epoch": 2.4129227662796566,
"grad_norm": 0.8517895936965942,
"learning_rate": 1.552246340232206e-05,
"loss": 0.0705,
"step": 4780
},
{
"epoch": 2.417970721857648,
"grad_norm": 0.4087599813938141,
"learning_rate": 1.5492175668854117e-05,
"loss": 0.0638,
"step": 4790
},
{
"epoch": 2.4230186774356386,
"grad_norm": 0.3779948651790619,
"learning_rate": 1.5461887935386168e-05,
"loss": 0.0524,
"step": 4800
},
{
"epoch": 2.4280666330136293,
"grad_norm": 0.42263171076774597,
"learning_rate": 1.543160020191822e-05,
"loss": 0.0623,
"step": 4810
},
{
"epoch": 2.4331145885916206,
"grad_norm": 0.5812351107597351,
"learning_rate": 1.5401312468450278e-05,
"loss": 0.0573,
"step": 4820
},
{
"epoch": 2.4381625441696113,
"grad_norm": 0.6073315143585205,
"learning_rate": 1.537102473498233e-05,
"loss": 0.057,
"step": 4830
},
{
"epoch": 2.443210499747602,
"grad_norm": 0.8706870079040527,
"learning_rate": 1.5340737001514388e-05,
"loss": 0.0606,
"step": 4840
},
{
"epoch": 2.4482584553255933,
"grad_norm": 0.9355966448783875,
"learning_rate": 1.531044926804644e-05,
"loss": 0.0563,
"step": 4850
},
{
"epoch": 2.453306410903584,
"grad_norm": 0.6352431774139404,
"learning_rate": 1.5280161534578495e-05,
"loss": 0.0537,
"step": 4860
},
{
"epoch": 2.458354366481575,
"grad_norm": 0.5970965623855591,
"learning_rate": 1.524987380111055e-05,
"loss": 0.0663,
"step": 4870
},
{
"epoch": 2.463402322059566,
"grad_norm": 0.40907353162765503,
"learning_rate": 1.5219586067642605e-05,
"loss": 0.0502,
"step": 4880
},
{
"epoch": 2.4684502776375568,
"grad_norm": 0.5130166411399841,
"learning_rate": 1.518929833417466e-05,
"loss": 0.0538,
"step": 4890
},
{
"epoch": 2.4734982332155475,
"grad_norm": 0.9824861288070679,
"learning_rate": 1.5159010600706716e-05,
"loss": 0.0518,
"step": 4900
},
{
"epoch": 2.4785461887935387,
"grad_norm": 0.6424157023429871,
"learning_rate": 1.512872286723877e-05,
"loss": 0.0599,
"step": 4910
},
{
"epoch": 2.4835941443715295,
"grad_norm": 0.8797338008880615,
"learning_rate": 1.5098435133770824e-05,
"loss": 0.0534,
"step": 4920
},
{
"epoch": 2.4886420999495202,
"grad_norm": 1.0275185108184814,
"learning_rate": 1.5068147400302876e-05,
"loss": 0.063,
"step": 4930
},
{
"epoch": 2.4936900555275114,
"grad_norm": 0.6370276808738708,
"learning_rate": 1.5037859666834931e-05,
"loss": 0.0584,
"step": 4940
},
{
"epoch": 2.498738011105502,
"grad_norm": 0.5083595514297485,
"learning_rate": 1.5007571933366986e-05,
"loss": 0.0635,
"step": 4950
},
{
"epoch": 2.503785966683493,
"grad_norm": 0.8423396348953247,
"learning_rate": 1.4977284199899041e-05,
"loss": 0.0593,
"step": 4960
},
{
"epoch": 2.508833922261484,
"grad_norm": 0.6133778691291809,
"learning_rate": 1.4946996466431095e-05,
"loss": 0.0652,
"step": 4970
},
{
"epoch": 2.513881877839475,
"grad_norm": 0.5626839995384216,
"learning_rate": 1.491670873296315e-05,
"loss": 0.061,
"step": 4980
},
{
"epoch": 2.5189298334174657,
"grad_norm": 0.6379786729812622,
"learning_rate": 1.4886420999495205e-05,
"loss": 0.0583,
"step": 4990
},
{
"epoch": 2.523977788995457,
"grad_norm": 0.39859360456466675,
"learning_rate": 1.485613326602726e-05,
"loss": 0.057,
"step": 5000
},
{
"epoch": 2.5290257445734476,
"grad_norm": 0.4674101173877716,
"learning_rate": 1.4825845532559315e-05,
"loss": 0.0584,
"step": 5010
},
{
"epoch": 2.5340737001514384,
"grad_norm": 0.6018111705780029,
"learning_rate": 1.4795557799091367e-05,
"loss": 0.0606,
"step": 5020
},
{
"epoch": 2.5391216557294296,
"grad_norm": 0.4932622015476227,
"learning_rate": 1.4765270065623422e-05,
"loss": 0.0551,
"step": 5030
},
{
"epoch": 2.5441696113074204,
"grad_norm": 0.5576731562614441,
"learning_rate": 1.4734982332155477e-05,
"loss": 0.0562,
"step": 5040
},
{
"epoch": 2.5492175668854116,
"grad_norm": 0.5910426378250122,
"learning_rate": 1.4704694598687533e-05,
"loss": 0.0632,
"step": 5050
},
{
"epoch": 2.5542655224634023,
"grad_norm": 0.42830216884613037,
"learning_rate": 1.4674406865219586e-05,
"loss": 0.0589,
"step": 5060
},
{
"epoch": 2.559313478041393,
"grad_norm": 0.657305896282196,
"learning_rate": 1.4644119131751641e-05,
"loss": 0.0666,
"step": 5070
},
{
"epoch": 2.5643614336193843,
"grad_norm": 0.5498583912849426,
"learning_rate": 1.4613831398283696e-05,
"loss": 0.0677,
"step": 5080
},
{
"epoch": 2.569409389197375,
"grad_norm": 1.5641086101531982,
"learning_rate": 1.458354366481575e-05,
"loss": 0.0618,
"step": 5090
},
{
"epoch": 2.5744573447753663,
"grad_norm": 0.576878011226654,
"learning_rate": 1.4553255931347805e-05,
"loss": 0.0596,
"step": 5100
},
{
"epoch": 2.579505300353357,
"grad_norm": 0.6855084896087646,
"learning_rate": 1.4522968197879858e-05,
"loss": 0.0684,
"step": 5110
},
{
"epoch": 2.5845532559313478,
"grad_norm": 0.46760818362236023,
"learning_rate": 1.4492680464411913e-05,
"loss": 0.0628,
"step": 5120
},
{
"epoch": 2.589601211509339,
"grad_norm": 0.4708857834339142,
"learning_rate": 1.4462392730943969e-05,
"loss": 0.0656,
"step": 5130
},
{
"epoch": 2.5946491670873297,
"grad_norm": 0.957336962223053,
"learning_rate": 1.4432104997476024e-05,
"loss": 0.0527,
"step": 5140
},
{
"epoch": 2.5996971226653205,
"grad_norm": 0.6079381704330444,
"learning_rate": 1.4401817264008077e-05,
"loss": 0.0499,
"step": 5150
},
{
"epoch": 2.6047450782433117,
"grad_norm": 0.644965410232544,
"learning_rate": 1.437152953054013e-05,
"loss": 0.0567,
"step": 5160
},
{
"epoch": 2.6097930338213025,
"grad_norm": 0.9058682322502136,
"learning_rate": 1.4341241797072186e-05,
"loss": 0.059,
"step": 5170
},
{
"epoch": 2.614840989399293,
"grad_norm": 0.6784061789512634,
"learning_rate": 1.4310954063604241e-05,
"loss": 0.0577,
"step": 5180
},
{
"epoch": 2.6198889449772844,
"grad_norm": 0.7699759602546692,
"learning_rate": 1.4280666330136296e-05,
"loss": 0.056,
"step": 5190
},
{
"epoch": 2.624936900555275,
"grad_norm": 1.0204094648361206,
"learning_rate": 1.425037859666835e-05,
"loss": 0.0595,
"step": 5200
},
{
"epoch": 2.629984856133266,
"grad_norm": 0.3317660987377167,
"learning_rate": 1.4220090863200403e-05,
"loss": 0.0579,
"step": 5210
},
{
"epoch": 2.635032811711257,
"grad_norm": 0.7586853504180908,
"learning_rate": 1.4189803129732458e-05,
"loss": 0.0612,
"step": 5220
},
{
"epoch": 2.640080767289248,
"grad_norm": 0.43295013904571533,
"learning_rate": 1.4159515396264513e-05,
"loss": 0.0584,
"step": 5230
},
{
"epoch": 2.6451287228672387,
"grad_norm": 0.9083705544471741,
"learning_rate": 1.4129227662796568e-05,
"loss": 0.0698,
"step": 5240
},
{
"epoch": 2.65017667844523,
"grad_norm": 0.6299885511398315,
"learning_rate": 1.4098939929328622e-05,
"loss": 0.0602,
"step": 5250
},
{
"epoch": 2.6552246340232206,
"grad_norm": 0.538589358329773,
"learning_rate": 1.4068652195860677e-05,
"loss": 0.0634,
"step": 5260
},
{
"epoch": 2.6602725896012114,
"grad_norm": 0.5712538361549377,
"learning_rate": 1.4038364462392732e-05,
"loss": 0.0625,
"step": 5270
},
{
"epoch": 2.6653205451792026,
"grad_norm": 0.5739433765411377,
"learning_rate": 1.4008076728924786e-05,
"loss": 0.0647,
"step": 5280
},
{
"epoch": 2.6703685007571933,
"grad_norm": 0.5050386786460876,
"learning_rate": 1.397778899545684e-05,
"loss": 0.0592,
"step": 5290
},
{
"epoch": 2.675416456335184,
"grad_norm": 0.41851407289505005,
"learning_rate": 1.3947501261988894e-05,
"loss": 0.0581,
"step": 5300
},
{
"epoch": 2.6804644119131753,
"grad_norm": 0.5866436958312988,
"learning_rate": 1.391721352852095e-05,
"loss": 0.0656,
"step": 5310
},
{
"epoch": 2.685512367491166,
"grad_norm": 0.47498345375061035,
"learning_rate": 1.3886925795053004e-05,
"loss": 0.0657,
"step": 5320
},
{
"epoch": 2.690560323069157,
"grad_norm": 0.5748500227928162,
"learning_rate": 1.385663806158506e-05,
"loss": 0.0588,
"step": 5330
},
{
"epoch": 2.695608278647148,
"grad_norm": 0.685787558555603,
"learning_rate": 1.3826350328117113e-05,
"loss": 0.0621,
"step": 5340
},
{
"epoch": 2.700656234225139,
"grad_norm": 0.5321753025054932,
"learning_rate": 1.3796062594649166e-05,
"loss": 0.0665,
"step": 5350
},
{
"epoch": 2.7057041898031295,
"grad_norm": 0.4687628746032715,
"learning_rate": 1.3765774861181222e-05,
"loss": 0.0622,
"step": 5360
},
{
"epoch": 2.7107521453811207,
"grad_norm": 0.6931032538414001,
"learning_rate": 1.3735487127713277e-05,
"loss": 0.0542,
"step": 5370
},
{
"epoch": 2.7158001009591115,
"grad_norm": 0.6347541213035583,
"learning_rate": 1.3705199394245332e-05,
"loss": 0.0618,
"step": 5380
},
{
"epoch": 2.7208480565371023,
"grad_norm": 0.5090097188949585,
"learning_rate": 1.3674911660777385e-05,
"loss": 0.0577,
"step": 5390
},
{
"epoch": 2.7258960121150935,
"grad_norm": 0.557161808013916,
"learning_rate": 1.3644623927309439e-05,
"loss": 0.0485,
"step": 5400
},
{
"epoch": 2.7309439676930842,
"grad_norm": 0.7229135036468506,
"learning_rate": 1.3614336193841494e-05,
"loss": 0.0642,
"step": 5410
},
{
"epoch": 2.735991923271075,
"grad_norm": 0.7802084684371948,
"learning_rate": 1.3584048460373549e-05,
"loss": 0.0721,
"step": 5420
},
{
"epoch": 2.741039878849066,
"grad_norm": 0.8350520730018616,
"learning_rate": 1.3553760726905604e-05,
"loss": 0.05,
"step": 5430
},
{
"epoch": 2.746087834427057,
"grad_norm": 0.24809196591377258,
"learning_rate": 1.3523472993437658e-05,
"loss": 0.0577,
"step": 5440
},
{
"epoch": 2.7511357900050477,
"grad_norm": 0.5501554608345032,
"learning_rate": 1.3493185259969713e-05,
"loss": 0.0613,
"step": 5450
},
{
"epoch": 2.756183745583039,
"grad_norm": 0.6459994912147522,
"learning_rate": 1.3462897526501768e-05,
"loss": 0.0545,
"step": 5460
},
{
"epoch": 2.7612317011610297,
"grad_norm": 1.0892735719680786,
"learning_rate": 1.3432609793033821e-05,
"loss": 0.0517,
"step": 5470
},
{
"epoch": 2.7662796567390204,
"grad_norm": 0.8553361296653748,
"learning_rate": 1.3402322059565877e-05,
"loss": 0.055,
"step": 5480
},
{
"epoch": 2.7713276123170116,
"grad_norm": 0.5909534692764282,
"learning_rate": 1.337203432609793e-05,
"loss": 0.0583,
"step": 5490
},
{
"epoch": 2.7763755678950024,
"grad_norm": 0.3620651662349701,
"learning_rate": 1.3341746592629985e-05,
"loss": 0.053,
"step": 5500
},
{
"epoch": 2.7814235234729936,
"grad_norm": 0.6525430083274841,
"learning_rate": 1.331145885916204e-05,
"loss": 0.0667,
"step": 5510
},
{
"epoch": 2.7864714790509844,
"grad_norm": 0.6129066944122314,
"learning_rate": 1.3281171125694095e-05,
"loss": 0.0578,
"step": 5520
},
{
"epoch": 2.791519434628975,
"grad_norm": 0.6374188661575317,
"learning_rate": 1.3250883392226147e-05,
"loss": 0.0598,
"step": 5530
},
{
"epoch": 2.7965673902069663,
"grad_norm": 0.6404274702072144,
"learning_rate": 1.3220595658758202e-05,
"loss": 0.064,
"step": 5540
},
{
"epoch": 2.801615345784957,
"grad_norm": 0.3882500231266022,
"learning_rate": 1.3190307925290257e-05,
"loss": 0.0556,
"step": 5550
},
{
"epoch": 2.8066633013629483,
"grad_norm": 0.827498197555542,
"learning_rate": 1.3160020191822313e-05,
"loss": 0.056,
"step": 5560
},
{
"epoch": 2.811711256940939,
"grad_norm": 0.5474889874458313,
"learning_rate": 1.3129732458354368e-05,
"loss": 0.0559,
"step": 5570
},
{
"epoch": 2.81675921251893,
"grad_norm": 0.7505003809928894,
"learning_rate": 1.3099444724886421e-05,
"loss": 0.0562,
"step": 5580
},
{
"epoch": 2.821807168096921,
"grad_norm": 0.7723977565765381,
"learning_rate": 1.3069156991418476e-05,
"loss": 0.0711,
"step": 5590
},
{
"epoch": 2.8268551236749118,
"grad_norm": 0.5930567979812622,
"learning_rate": 1.303886925795053e-05,
"loss": 0.0666,
"step": 5600
},
{
"epoch": 2.8319030792529025,
"grad_norm": 0.9205801486968994,
"learning_rate": 1.3008581524482585e-05,
"loss": 0.0635,
"step": 5610
},
{
"epoch": 2.8369510348308937,
"grad_norm": 0.6520891189575195,
"learning_rate": 1.297829379101464e-05,
"loss": 0.0503,
"step": 5620
},
{
"epoch": 2.8419989904088845,
"grad_norm": 0.697742760181427,
"learning_rate": 1.2948006057546693e-05,
"loss": 0.0527,
"step": 5630
},
{
"epoch": 2.8470469459868752,
"grad_norm": 0.5600337386131287,
"learning_rate": 1.2917718324078749e-05,
"loss": 0.0658,
"step": 5640
},
{
"epoch": 2.8520949015648664,
"grad_norm": 0.7648780941963196,
"learning_rate": 1.2887430590610804e-05,
"loss": 0.0503,
"step": 5650
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.44580090045928955,
"learning_rate": 1.2857142857142857e-05,
"loss": 0.0569,
"step": 5660
},
{
"epoch": 2.862190812720848,
"grad_norm": 0.6274628043174744,
"learning_rate": 1.2826855123674912e-05,
"loss": 0.0544,
"step": 5670
},
{
"epoch": 2.867238768298839,
"grad_norm": 0.5967713594436646,
"learning_rate": 1.2796567390206966e-05,
"loss": 0.049,
"step": 5680
},
{
"epoch": 2.87228672387683,
"grad_norm": 0.49563518166542053,
"learning_rate": 1.2766279656739021e-05,
"loss": 0.0637,
"step": 5690
},
{
"epoch": 2.8773346794548207,
"grad_norm": 0.5065841674804688,
"learning_rate": 1.2735991923271076e-05,
"loss": 0.0635,
"step": 5700
},
{
"epoch": 2.882382635032812,
"grad_norm": 0.4228837490081787,
"learning_rate": 1.2705704189803131e-05,
"loss": 0.0561,
"step": 5710
},
{
"epoch": 2.8874305906108026,
"grad_norm": 0.36254429817199707,
"learning_rate": 1.2675416456335183e-05,
"loss": 0.0564,
"step": 5720
},
{
"epoch": 2.8924785461887934,
"grad_norm": 0.6964749097824097,
"learning_rate": 1.2645128722867238e-05,
"loss": 0.0566,
"step": 5730
},
{
"epoch": 2.8975265017667846,
"grad_norm": 1.2399131059646606,
"learning_rate": 1.2614840989399293e-05,
"loss": 0.0528,
"step": 5740
},
{
"epoch": 2.9025744573447754,
"grad_norm": 0.45011046528816223,
"learning_rate": 1.2584553255931348e-05,
"loss": 0.0605,
"step": 5750
},
{
"epoch": 2.907622412922766,
"grad_norm": 0.6450422406196594,
"learning_rate": 1.2554265522463404e-05,
"loss": 0.0579,
"step": 5760
},
{
"epoch": 2.9126703685007573,
"grad_norm": 0.6685008406639099,
"learning_rate": 1.2523977788995457e-05,
"loss": 0.0596,
"step": 5770
},
{
"epoch": 2.917718324078748,
"grad_norm": 0.7710725665092468,
"learning_rate": 1.2493690055527512e-05,
"loss": 0.063,
"step": 5780
},
{
"epoch": 2.922766279656739,
"grad_norm": 0.6229269504547119,
"learning_rate": 1.2463402322059566e-05,
"loss": 0.0542,
"step": 5790
},
{
"epoch": 2.92781423523473,
"grad_norm": 0.41364407539367676,
"learning_rate": 1.243311458859162e-05,
"loss": 0.0588,
"step": 5800
},
{
"epoch": 2.932862190812721,
"grad_norm": 0.5546961426734924,
"learning_rate": 1.2402826855123676e-05,
"loss": 0.0607,
"step": 5810
},
{
"epoch": 2.9379101463907116,
"grad_norm": 0.6814476251602173,
"learning_rate": 1.237253912165573e-05,
"loss": 0.0587,
"step": 5820
},
{
"epoch": 2.9429581019687028,
"grad_norm": 0.7745892405509949,
"learning_rate": 1.2342251388187784e-05,
"loss": 0.0484,
"step": 5830
},
{
"epoch": 2.9480060575466935,
"grad_norm": 0.9947149157524109,
"learning_rate": 1.231196365471984e-05,
"loss": 0.056,
"step": 5840
},
{
"epoch": 2.9530540131246843,
"grad_norm": 0.599892258644104,
"learning_rate": 1.2281675921251893e-05,
"loss": 0.0603,
"step": 5850
},
{
"epoch": 2.9581019687026755,
"grad_norm": 0.4991750121116638,
"learning_rate": 1.2251388187783947e-05,
"loss": 0.0603,
"step": 5860
},
{
"epoch": 2.9631499242806663,
"grad_norm": 0.44697603583335876,
"learning_rate": 1.2221100454316002e-05,
"loss": 0.0614,
"step": 5870
},
{
"epoch": 2.968197879858657,
"grad_norm": 0.34608447551727295,
"learning_rate": 1.2190812720848057e-05,
"loss": 0.0633,
"step": 5880
},
{
"epoch": 2.973245835436648,
"grad_norm": 0.6991161108016968,
"learning_rate": 1.2160524987380112e-05,
"loss": 0.0713,
"step": 5890
},
{
"epoch": 2.978293791014639,
"grad_norm": 0.7053156495094299,
"learning_rate": 1.2130237253912167e-05,
"loss": 0.0642,
"step": 5900
},
{
"epoch": 2.9833417465926297,
"grad_norm": 0.4541454315185547,
"learning_rate": 1.209994952044422e-05,
"loss": 0.0583,
"step": 5910
},
{
"epoch": 2.988389702170621,
"grad_norm": 0.5963706970214844,
"learning_rate": 1.2069661786976274e-05,
"loss": 0.0551,
"step": 5920
},
{
"epoch": 2.9934376577486117,
"grad_norm": 0.37611526250839233,
"learning_rate": 1.2039374053508329e-05,
"loss": 0.0551,
"step": 5930
},
{
"epoch": 2.9984856133266025,
"grad_norm": 0.5949448943138123,
"learning_rate": 1.2009086320040384e-05,
"loss": 0.0615,
"step": 5940
},
{
"epoch": 3.0,
"eval_f1": 0.9705180789481339,
"eval_loss": 0.04155249148607254,
"eval_runtime": 582.0561,
"eval_samples_per_second": 354.368,
"eval_steps_per_second": 2.769,
"step": 5943
},
{
"epoch": 3.0035335689045937,
"grad_norm": 0.732612133026123,
"learning_rate": 1.197879858657244e-05,
"loss": 0.0473,
"step": 5950
},
{
"epoch": 3.0085815244825844,
"grad_norm": 0.8803137540817261,
"learning_rate": 1.1948510853104493e-05,
"loss": 0.0513,
"step": 5960
},
{
"epoch": 3.0136294800605756,
"grad_norm": 0.5578094720840454,
"learning_rate": 1.1918223119636548e-05,
"loss": 0.0603,
"step": 5970
},
{
"epoch": 3.0186774356385664,
"grad_norm": 0.9948665499687195,
"learning_rate": 1.1887935386168601e-05,
"loss": 0.0592,
"step": 5980
},
{
"epoch": 3.023725391216557,
"grad_norm": 0.6967259049415588,
"learning_rate": 1.1857647652700657e-05,
"loss": 0.0741,
"step": 5990
},
{
"epoch": 3.0287733467945483,
"grad_norm": 0.48011064529418945,
"learning_rate": 1.182735991923271e-05,
"loss": 0.055,
"step": 6000
},
{
"epoch": 3.033821302372539,
"grad_norm": 0.663847804069519,
"learning_rate": 1.1797072185764765e-05,
"loss": 0.0591,
"step": 6010
},
{
"epoch": 3.03886925795053,
"grad_norm": 0.589154839515686,
"learning_rate": 1.176678445229682e-05,
"loss": 0.0508,
"step": 6020
},
{
"epoch": 3.043917213528521,
"grad_norm": 0.7075181007385254,
"learning_rate": 1.1736496718828875e-05,
"loss": 0.0493,
"step": 6030
},
{
"epoch": 3.048965169106512,
"grad_norm": 0.6230030655860901,
"learning_rate": 1.1706208985360929e-05,
"loss": 0.0589,
"step": 6040
},
{
"epoch": 3.0540131246845026,
"grad_norm": 0.6204888820648193,
"learning_rate": 1.1675921251892982e-05,
"loss": 0.0602,
"step": 6050
},
{
"epoch": 3.059061080262494,
"grad_norm": 0.456939160823822,
"learning_rate": 1.1645633518425038e-05,
"loss": 0.059,
"step": 6060
},
{
"epoch": 3.0641090358404846,
"grad_norm": 0.7607660889625549,
"learning_rate": 1.1615345784957093e-05,
"loss": 0.0488,
"step": 6070
},
{
"epoch": 3.0691569914184753,
"grad_norm": 1.2064040899276733,
"learning_rate": 1.1585058051489148e-05,
"loss": 0.0695,
"step": 6080
},
{
"epoch": 3.0742049469964665,
"grad_norm": 0.5143324732780457,
"learning_rate": 1.1554770318021203e-05,
"loss": 0.0606,
"step": 6090
},
{
"epoch": 3.0792529025744573,
"grad_norm": 0.6567758917808533,
"learning_rate": 1.1524482584553256e-05,
"loss": 0.0581,
"step": 6100
},
{
"epoch": 3.0843008581524485,
"grad_norm": 0.7469787001609802,
"learning_rate": 1.149419485108531e-05,
"loss": 0.0535,
"step": 6110
},
{
"epoch": 3.0893488137304392,
"grad_norm": 0.40161028504371643,
"learning_rate": 1.1463907117617365e-05,
"loss": 0.056,
"step": 6120
},
{
"epoch": 3.09439676930843,
"grad_norm": 0.7404605150222778,
"learning_rate": 1.143361938414942e-05,
"loss": 0.0471,
"step": 6130
},
{
"epoch": 3.099444724886421,
"grad_norm": 0.8587531447410583,
"learning_rate": 1.1403331650681475e-05,
"loss": 0.0558,
"step": 6140
},
{
"epoch": 3.104492680464412,
"grad_norm": 0.424450159072876,
"learning_rate": 1.1373043917213529e-05,
"loss": 0.0558,
"step": 6150
},
{
"epoch": 3.1095406360424027,
"grad_norm": 0.9383788704872131,
"learning_rate": 1.1342756183745584e-05,
"loss": 0.0517,
"step": 6160
},
{
"epoch": 3.114588591620394,
"grad_norm": 0.8069589734077454,
"learning_rate": 1.1312468450277637e-05,
"loss": 0.0588,
"step": 6170
},
{
"epoch": 3.1196365471983847,
"grad_norm": 0.8677689433097839,
"learning_rate": 1.1282180716809692e-05,
"loss": 0.0611,
"step": 6180
},
{
"epoch": 3.1246845027763754,
"grad_norm": 0.7949932813644409,
"learning_rate": 1.1251892983341746e-05,
"loss": 0.0553,
"step": 6190
},
{
"epoch": 3.1297324583543666,
"grad_norm": 0.6563514471054077,
"learning_rate": 1.1221605249873801e-05,
"loss": 0.0549,
"step": 6200
},
{
"epoch": 3.1347804139323574,
"grad_norm": 0.5856168866157532,
"learning_rate": 1.1191317516405856e-05,
"loss": 0.0585,
"step": 6210
},
{
"epoch": 3.139828369510348,
"grad_norm": 0.6840217709541321,
"learning_rate": 1.1161029782937911e-05,
"loss": 0.0683,
"step": 6220
},
{
"epoch": 3.1448763250883394,
"grad_norm": 1.310652494430542,
"learning_rate": 1.1130742049469966e-05,
"loss": 0.057,
"step": 6230
},
{
"epoch": 3.14992428066633,
"grad_norm": 0.6700050830841064,
"learning_rate": 1.1100454316002018e-05,
"loss": 0.0562,
"step": 6240
},
{
"epoch": 3.154972236244321,
"grad_norm": 0.5210493803024292,
"learning_rate": 1.1070166582534073e-05,
"loss": 0.0545,
"step": 6250
},
{
"epoch": 3.160020191822312,
"grad_norm": 0.44693487882614136,
"learning_rate": 1.1039878849066128e-05,
"loss": 0.0614,
"step": 6260
},
{
"epoch": 3.165068147400303,
"grad_norm": 0.8827401995658875,
"learning_rate": 1.1009591115598184e-05,
"loss": 0.06,
"step": 6270
},
{
"epoch": 3.1701161029782936,
"grad_norm": 0.29074421525001526,
"learning_rate": 1.0979303382130239e-05,
"loss": 0.059,
"step": 6280
},
{
"epoch": 3.175164058556285,
"grad_norm": 0.8659618496894836,
"learning_rate": 1.0949015648662292e-05,
"loss": 0.0541,
"step": 6290
},
{
"epoch": 3.1802120141342756,
"grad_norm": 0.8624622821807861,
"learning_rate": 1.0918727915194346e-05,
"loss": 0.0661,
"step": 6300
},
{
"epoch": 3.1852599697122663,
"grad_norm": 0.6411763429641724,
"learning_rate": 1.08884401817264e-05,
"loss": 0.0642,
"step": 6310
},
{
"epoch": 3.1903079252902575,
"grad_norm": 0.5271298289299011,
"learning_rate": 1.0858152448258456e-05,
"loss": 0.0552,
"step": 6320
},
{
"epoch": 3.1953558808682483,
"grad_norm": 0.9701720476150513,
"learning_rate": 1.082786471479051e-05,
"loss": 0.0586,
"step": 6330
},
{
"epoch": 3.200403836446239,
"grad_norm": 0.5633390545845032,
"learning_rate": 1.0797576981322565e-05,
"loss": 0.0554,
"step": 6340
},
{
"epoch": 3.2054517920242303,
"grad_norm": 0.45846840739250183,
"learning_rate": 1.076728924785462e-05,
"loss": 0.0582,
"step": 6350
},
{
"epoch": 3.210499747602221,
"grad_norm": 0.43338650465011597,
"learning_rate": 1.0737001514386673e-05,
"loss": 0.0588,
"step": 6360
},
{
"epoch": 3.215547703180212,
"grad_norm": 0.8287716507911682,
"learning_rate": 1.0706713780918728e-05,
"loss": 0.053,
"step": 6370
},
{
"epoch": 3.220595658758203,
"grad_norm": 0.5174350142478943,
"learning_rate": 1.0676426047450782e-05,
"loss": 0.0587,
"step": 6380
},
{
"epoch": 3.2256436143361937,
"grad_norm": 0.47460228204727173,
"learning_rate": 1.0646138313982837e-05,
"loss": 0.0598,
"step": 6390
},
{
"epoch": 3.230691569914185,
"grad_norm": 0.49122539162635803,
"learning_rate": 1.0615850580514892e-05,
"loss": 0.0535,
"step": 6400
},
{
"epoch": 3.2357395254921757,
"grad_norm": 0.5462148189544678,
"learning_rate": 1.0585562847046947e-05,
"loss": 0.0518,
"step": 6410
},
{
"epoch": 3.2407874810701665,
"grad_norm": 0.7671846747398376,
"learning_rate": 1.0555275113579002e-05,
"loss": 0.0611,
"step": 6420
},
{
"epoch": 3.2458354366481577,
"grad_norm": 0.6748913526535034,
"learning_rate": 1.0524987380111054e-05,
"loss": 0.0561,
"step": 6430
},
{
"epoch": 3.2508833922261484,
"grad_norm": 0.5004613399505615,
"learning_rate": 1.049469964664311e-05,
"loss": 0.0534,
"step": 6440
},
{
"epoch": 3.255931347804139,
"grad_norm": 0.4895551800727844,
"learning_rate": 1.0464411913175164e-05,
"loss": 0.0459,
"step": 6450
},
{
"epoch": 3.2609793033821304,
"grad_norm": 0.47480469942092896,
"learning_rate": 1.043412417970722e-05,
"loss": 0.0601,
"step": 6460
},
{
"epoch": 3.266027258960121,
"grad_norm": 0.4885694086551666,
"learning_rate": 1.0403836446239273e-05,
"loss": 0.0598,
"step": 6470
},
{
"epoch": 3.271075214538112,
"grad_norm": 0.6375486254692078,
"learning_rate": 1.0373548712771328e-05,
"loss": 0.0602,
"step": 6480
},
{
"epoch": 3.276123170116103,
"grad_norm": 0.7264606356620789,
"learning_rate": 1.0343260979303382e-05,
"loss": 0.0579,
"step": 6490
},
{
"epoch": 3.281171125694094,
"grad_norm": 0.5704456567764282,
"learning_rate": 1.0312973245835437e-05,
"loss": 0.056,
"step": 6500
},
{
"epoch": 3.2862190812720846,
"grad_norm": 0.6324512362480164,
"learning_rate": 1.0282685512367492e-05,
"loss": 0.0515,
"step": 6510
},
{
"epoch": 3.291267036850076,
"grad_norm": 0.5736483931541443,
"learning_rate": 1.0252397778899545e-05,
"loss": 0.0538,
"step": 6520
},
{
"epoch": 3.2963149924280666,
"grad_norm": 0.48032522201538086,
"learning_rate": 1.02221100454316e-05,
"loss": 0.0568,
"step": 6530
},
{
"epoch": 3.301362948006058,
"grad_norm": 0.6696997880935669,
"learning_rate": 1.0191822311963656e-05,
"loss": 0.0537,
"step": 6540
},
{
"epoch": 3.3064109035840485,
"grad_norm": 0.44333356618881226,
"learning_rate": 1.016153457849571e-05,
"loss": 0.0514,
"step": 6550
},
{
"epoch": 3.3114588591620393,
"grad_norm": 0.6224443912506104,
"learning_rate": 1.0131246845027764e-05,
"loss": 0.0607,
"step": 6560
},
{
"epoch": 3.3165068147400305,
"grad_norm": 0.7066437602043152,
"learning_rate": 1.0100959111559818e-05,
"loss": 0.0563,
"step": 6570
},
{
"epoch": 3.3215547703180213,
"grad_norm": 0.6406083106994629,
"learning_rate": 1.0070671378091873e-05,
"loss": 0.0573,
"step": 6580
},
{
"epoch": 3.326602725896012,
"grad_norm": 0.44534462690353394,
"learning_rate": 1.0040383644623928e-05,
"loss": 0.059,
"step": 6590
},
{
"epoch": 3.3316506814740032,
"grad_norm": 0.7137624025344849,
"learning_rate": 1.0010095911155983e-05,
"loss": 0.0568,
"step": 6600
},
{
"epoch": 3.336698637051994,
"grad_norm": 0.6909269690513611,
"learning_rate": 9.979808177688038e-06,
"loss": 0.0493,
"step": 6610
},
{
"epoch": 3.3417465926299847,
"grad_norm": 0.6987153887748718,
"learning_rate": 9.94952044422009e-06,
"loss": 0.059,
"step": 6620
},
{
"epoch": 3.346794548207976,
"grad_norm": 0.538732647895813,
"learning_rate": 9.919232710752145e-06,
"loss": 0.0582,
"step": 6630
},
{
"epoch": 3.3518425037859667,
"grad_norm": 0.6330693960189819,
"learning_rate": 9.8889449772842e-06,
"loss": 0.0506,
"step": 6640
},
{
"epoch": 3.3568904593639575,
"grad_norm": 0.5216783881187439,
"learning_rate": 9.858657243816255e-06,
"loss": 0.0544,
"step": 6650
},
{
"epoch": 3.3619384149419487,
"grad_norm": 0.7052462697029114,
"learning_rate": 9.828369510348309e-06,
"loss": 0.0553,
"step": 6660
},
{
"epoch": 3.3669863705199394,
"grad_norm": 0.7679615616798401,
"learning_rate": 9.798081776880364e-06,
"loss": 0.061,
"step": 6670
},
{
"epoch": 3.37203432609793,
"grad_norm": 0.530564546585083,
"learning_rate": 9.767794043412417e-06,
"loss": 0.0567,
"step": 6680
},
{
"epoch": 3.3770822816759214,
"grad_norm": 0.6907301545143127,
"learning_rate": 9.737506309944473e-06,
"loss": 0.0561,
"step": 6690
},
{
"epoch": 3.382130237253912,
"grad_norm": 0.7837420105934143,
"learning_rate": 9.707218576476528e-06,
"loss": 0.0618,
"step": 6700
},
{
"epoch": 3.387178192831903,
"grad_norm": 0.6361984014511108,
"learning_rate": 9.676930843008581e-06,
"loss": 0.0533,
"step": 6710
},
{
"epoch": 3.392226148409894,
"grad_norm": 0.6775834560394287,
"learning_rate": 9.646643109540636e-06,
"loss": 0.0571,
"step": 6720
},
{
"epoch": 3.397274103987885,
"grad_norm": 0.4820801615715027,
"learning_rate": 9.616355376072691e-06,
"loss": 0.063,
"step": 6730
},
{
"epoch": 3.4023220595658756,
"grad_norm": 0.511091411113739,
"learning_rate": 9.586067642604747e-06,
"loss": 0.0621,
"step": 6740
},
{
"epoch": 3.407370015143867,
"grad_norm": 0.5163900852203369,
"learning_rate": 9.5557799091368e-06,
"loss": 0.0606,
"step": 6750
},
{
"epoch": 3.4124179707218576,
"grad_norm": 0.4652441740036011,
"learning_rate": 9.525492175668853e-06,
"loss": 0.0539,
"step": 6760
},
{
"epoch": 3.4174659262998484,
"grad_norm": 0.5968872904777527,
"learning_rate": 9.495204442200909e-06,
"loss": 0.0599,
"step": 6770
},
{
"epoch": 3.4225138818778396,
"grad_norm": 0.4634818732738495,
"learning_rate": 9.464916708732964e-06,
"loss": 0.0518,
"step": 6780
},
{
"epoch": 3.4275618374558303,
"grad_norm": 0.34169018268585205,
"learning_rate": 9.434628975265019e-06,
"loss": 0.0588,
"step": 6790
},
{
"epoch": 3.432609793033821,
"grad_norm": 0.719494640827179,
"learning_rate": 9.404341241797072e-06,
"loss": 0.0538,
"step": 6800
},
{
"epoch": 3.4376577486118123,
"grad_norm": 0.4465346336364746,
"learning_rate": 9.374053508329126e-06,
"loss": 0.0577,
"step": 6810
},
{
"epoch": 3.442705704189803,
"grad_norm": 0.6223052740097046,
"learning_rate": 9.343765774861181e-06,
"loss": 0.0598,
"step": 6820
},
{
"epoch": 3.447753659767794,
"grad_norm": 0.6854692697525024,
"learning_rate": 9.313478041393236e-06,
"loss": 0.0544,
"step": 6830
},
{
"epoch": 3.452801615345785,
"grad_norm": 1.0640225410461426,
"learning_rate": 9.283190307925291e-06,
"loss": 0.0569,
"step": 6840
},
{
"epoch": 3.4578495709237758,
"grad_norm": 0.5437650680541992,
"learning_rate": 9.252902574457345e-06,
"loss": 0.0612,
"step": 6850
},
{
"epoch": 3.462897526501767,
"grad_norm": 0.5767130255699158,
"learning_rate": 9.2226148409894e-06,
"loss": 0.0618,
"step": 6860
},
{
"epoch": 3.4679454820797577,
"grad_norm": 0.5814956426620483,
"learning_rate": 9.192327107521453e-06,
"loss": 0.0571,
"step": 6870
},
{
"epoch": 3.4729934376577485,
"grad_norm": 0.31469887495040894,
"learning_rate": 9.162039374053508e-06,
"loss": 0.0573,
"step": 6880
},
{
"epoch": 3.4780413932357397,
"grad_norm": 0.3987484872341156,
"learning_rate": 9.131751640585563e-06,
"loss": 0.0534,
"step": 6890
},
{
"epoch": 3.4830893488137304,
"grad_norm": 0.47312065958976746,
"learning_rate": 9.101463907117617e-06,
"loss": 0.0608,
"step": 6900
},
{
"epoch": 3.488137304391721,
"grad_norm": 0.4635220170021057,
"learning_rate": 9.071176173649672e-06,
"loss": 0.05,
"step": 6910
},
{
"epoch": 3.4931852599697124,
"grad_norm": 1.146721363067627,
"learning_rate": 9.040888440181727e-06,
"loss": 0.0548,
"step": 6920
},
{
"epoch": 3.498233215547703,
"grad_norm": 0.42057961225509644,
"learning_rate": 9.010600706713782e-06,
"loss": 0.0463,
"step": 6930
},
{
"epoch": 3.5032811711256944,
"grad_norm": 0.7835047841072083,
"learning_rate": 8.980312973245836e-06,
"loss": 0.0507,
"step": 6940
},
{
"epoch": 3.508329126703685,
"grad_norm": 0.6441161036491394,
"learning_rate": 8.95002523977789e-06,
"loss": 0.0571,
"step": 6950
},
{
"epoch": 3.513377082281676,
"grad_norm": 0.6828143000602722,
"learning_rate": 8.919737506309944e-06,
"loss": 0.0525,
"step": 6960
},
{
"epoch": 3.518425037859667,
"grad_norm": 0.8285954594612122,
"learning_rate": 8.889449772842e-06,
"loss": 0.0621,
"step": 6970
},
{
"epoch": 3.523472993437658,
"grad_norm": 0.4954177439212799,
"learning_rate": 8.859162039374055e-06,
"loss": 0.0625,
"step": 6980
},
{
"epoch": 3.5285209490156486,
"grad_norm": 0.7900820374488831,
"learning_rate": 8.828874305906108e-06,
"loss": 0.0603,
"step": 6990
},
{
"epoch": 3.53356890459364,
"grad_norm": 0.6767242550849915,
"learning_rate": 8.798586572438162e-06,
"loss": 0.0586,
"step": 7000
},
{
"epoch": 3.5386168601716306,
"grad_norm": 0.5408624410629272,
"learning_rate": 8.768298838970217e-06,
"loss": 0.0561,
"step": 7010
},
{
"epoch": 3.5436648157496213,
"grad_norm": 0.4577973484992981,
"learning_rate": 8.738011105502272e-06,
"loss": 0.057,
"step": 7020
},
{
"epoch": 3.5487127713276125,
"grad_norm": 0.7334242463111877,
"learning_rate": 8.707723372034327e-06,
"loss": 0.0602,
"step": 7030
},
{
"epoch": 3.5537607269056033,
"grad_norm": 0.5569146275520325,
"learning_rate": 8.67743563856638e-06,
"loss": 0.0564,
"step": 7040
},
{
"epoch": 3.558808682483594,
"grad_norm": 0.5739743709564209,
"learning_rate": 8.647147905098436e-06,
"loss": 0.0605,
"step": 7050
},
{
"epoch": 3.5638566380615853,
"grad_norm": 0.5553867816925049,
"learning_rate": 8.61686017163049e-06,
"loss": 0.0573,
"step": 7060
},
{
"epoch": 3.568904593639576,
"grad_norm": 0.7109550833702087,
"learning_rate": 8.586572438162544e-06,
"loss": 0.0634,
"step": 7070
},
{
"epoch": 3.5739525492175668,
"grad_norm": 0.46534502506256104,
"learning_rate": 8.5562847046946e-06,
"loss": 0.0494,
"step": 7080
},
{
"epoch": 3.579000504795558,
"grad_norm": 0.47850191593170166,
"learning_rate": 8.525996971226653e-06,
"loss": 0.0613,
"step": 7090
},
{
"epoch": 3.5840484603735487,
"grad_norm": 0.3749614953994751,
"learning_rate": 8.495709237758708e-06,
"loss": 0.0574,
"step": 7100
},
{
"epoch": 3.5890964159515395,
"grad_norm": 0.5852258801460266,
"learning_rate": 8.465421504290763e-06,
"loss": 0.064,
"step": 7110
},
{
"epoch": 3.5941443715295307,
"grad_norm": 0.3820860981941223,
"learning_rate": 8.435133770822818e-06,
"loss": 0.0559,
"step": 7120
},
{
"epoch": 3.5991923271075215,
"grad_norm": 0.5200080275535583,
"learning_rate": 8.40484603735487e-06,
"loss": 0.0556,
"step": 7130
},
{
"epoch": 3.604240282685512,
"grad_norm": 0.6472256183624268,
"learning_rate": 8.374558303886925e-06,
"loss": 0.0596,
"step": 7140
},
{
"epoch": 3.6092882382635034,
"grad_norm": 0.43182119727134705,
"learning_rate": 8.34427057041898e-06,
"loss": 0.0478,
"step": 7150
},
{
"epoch": 3.614336193841494,
"grad_norm": 0.6659020781517029,
"learning_rate": 8.313982836951035e-06,
"loss": 0.054,
"step": 7160
},
{
"epoch": 3.619384149419485,
"grad_norm": 0.6561934947967529,
"learning_rate": 8.28369510348309e-06,
"loss": 0.0583,
"step": 7170
},
{
"epoch": 3.624432104997476,
"grad_norm": 0.7083423733711243,
"learning_rate": 8.253407370015144e-06,
"loss": 0.0598,
"step": 7180
},
{
"epoch": 3.629480060575467,
"grad_norm": 0.6030146479606628,
"learning_rate": 8.223119636547197e-06,
"loss": 0.0569,
"step": 7190
},
{
"epoch": 3.6345280161534577,
"grad_norm": 0.4650856554508209,
"learning_rate": 8.192831903079253e-06,
"loss": 0.0593,
"step": 7200
},
{
"epoch": 3.639575971731449,
"grad_norm": 0.5656235814094543,
"learning_rate": 8.162544169611308e-06,
"loss": 0.058,
"step": 7210
},
{
"epoch": 3.6446239273094396,
"grad_norm": 0.5745735764503479,
"learning_rate": 8.132256436143363e-06,
"loss": 0.0582,
"step": 7220
},
{
"epoch": 3.6496718828874304,
"grad_norm": 0.7879515886306763,
"learning_rate": 8.101968702675416e-06,
"loss": 0.0593,
"step": 7230
},
{
"epoch": 3.6547198384654216,
"grad_norm": 0.7000477313995361,
"learning_rate": 8.071680969207471e-06,
"loss": 0.0517,
"step": 7240
},
{
"epoch": 3.6597677940434123,
"grad_norm": 0.44397464394569397,
"learning_rate": 8.041393235739527e-06,
"loss": 0.0569,
"step": 7250
},
{
"epoch": 3.664815749621403,
"grad_norm": 0.55961674451828,
"learning_rate": 8.01110550227158e-06,
"loss": 0.0529,
"step": 7260
},
{
"epoch": 3.6698637051993943,
"grad_norm": 0.5441805720329285,
"learning_rate": 7.980817768803635e-06,
"loss": 0.0537,
"step": 7270
},
{
"epoch": 3.674911660777385,
"grad_norm": 0.5779780149459839,
"learning_rate": 7.950530035335689e-06,
"loss": 0.0549,
"step": 7280
},
{
"epoch": 3.679959616355376,
"grad_norm": 0.4491129517555237,
"learning_rate": 7.920242301867744e-06,
"loss": 0.0527,
"step": 7290
},
{
"epoch": 3.685007571933367,
"grad_norm": 0.6601787209510803,
"learning_rate": 7.889954568399799e-06,
"loss": 0.0545,
"step": 7300
},
{
"epoch": 3.690055527511358,
"grad_norm": 0.7920609712600708,
"learning_rate": 7.859666834931854e-06,
"loss": 0.0607,
"step": 7310
},
{
"epoch": 3.6951034830893486,
"grad_norm": 0.6220458149909973,
"learning_rate": 7.829379101463906e-06,
"loss": 0.0574,
"step": 7320
},
{
"epoch": 3.7001514386673398,
"grad_norm": 0.6900739669799805,
"learning_rate": 7.799091367995961e-06,
"loss": 0.0549,
"step": 7330
},
{
"epoch": 3.7051993942453305,
"grad_norm": 1.071191430091858,
"learning_rate": 7.768803634528016e-06,
"loss": 0.0644,
"step": 7340
},
{
"epoch": 3.7102473498233217,
"grad_norm": 0.5342854261398315,
"learning_rate": 7.738515901060071e-06,
"loss": 0.0639,
"step": 7350
},
{
"epoch": 3.7152953054013125,
"grad_norm": 0.49695709347724915,
"learning_rate": 7.708228167592126e-06,
"loss": 0.0525,
"step": 7360
},
{
"epoch": 3.7203432609793032,
"grad_norm": 0.6041547060012817,
"learning_rate": 7.67794043412418e-06,
"loss": 0.0596,
"step": 7370
},
{
"epoch": 3.7253912165572944,
"grad_norm": 0.6425964832305908,
"learning_rate": 7.647652700656235e-06,
"loss": 0.0626,
"step": 7380
},
{
"epoch": 3.730439172135285,
"grad_norm": 0.5185597538948059,
"learning_rate": 7.617364967188288e-06,
"loss": 0.063,
"step": 7390
},
{
"epoch": 3.7354871277132764,
"grad_norm": 0.48031681776046753,
"learning_rate": 7.587077233720343e-06,
"loss": 0.0633,
"step": 7400
},
{
"epoch": 3.740535083291267,
"grad_norm": 0.46377626061439514,
"learning_rate": 7.556789500252398e-06,
"loss": 0.0581,
"step": 7410
},
{
"epoch": 3.745583038869258,
"grad_norm": 0.7336452007293701,
"learning_rate": 7.526501766784453e-06,
"loss": 0.0572,
"step": 7420
},
{
"epoch": 3.750630994447249,
"grad_norm": 0.8720684051513672,
"learning_rate": 7.4962140333165064e-06,
"loss": 0.0558,
"step": 7430
},
{
"epoch": 3.75567895002524,
"grad_norm": 0.372592031955719,
"learning_rate": 7.465926299848562e-06,
"loss": 0.0613,
"step": 7440
},
{
"epoch": 3.7607269056032306,
"grad_norm": 0.5049020648002625,
"learning_rate": 7.435638566380616e-06,
"loss": 0.058,
"step": 7450
},
{
"epoch": 3.765774861181222,
"grad_norm": 0.5402325391769409,
"learning_rate": 7.405350832912671e-06,
"loss": 0.0484,
"step": 7460
},
{
"epoch": 3.7708228167592126,
"grad_norm": 0.5662652850151062,
"learning_rate": 7.375063099444725e-06,
"loss": 0.0613,
"step": 7470
},
{
"epoch": 3.7758707723372034,
"grad_norm": 0.6431825160980225,
"learning_rate": 7.34477536597678e-06,
"loss": 0.0522,
"step": 7480
},
{
"epoch": 3.7809187279151946,
"grad_norm": 0.9309275150299072,
"learning_rate": 7.314487632508835e-06,
"loss": 0.0602,
"step": 7490
},
{
"epoch": 3.7859666834931853,
"grad_norm": 0.801145076751709,
"learning_rate": 7.284199899040888e-06,
"loss": 0.0581,
"step": 7500
},
{
"epoch": 3.791014639071176,
"grad_norm": 0.5122712850570679,
"learning_rate": 7.253912165572943e-06,
"loss": 0.0552,
"step": 7510
},
{
"epoch": 3.7960625946491673,
"grad_norm": 0.39402052760124207,
"learning_rate": 7.223624432104998e-06,
"loss": 0.0552,
"step": 7520
},
{
"epoch": 3.801110550227158,
"grad_norm": 0.5302004814147949,
"learning_rate": 7.193336698637052e-06,
"loss": 0.0626,
"step": 7530
},
{
"epoch": 3.806158505805149,
"grad_norm": 0.4123098850250244,
"learning_rate": 7.163048965169107e-06,
"loss": 0.0569,
"step": 7540
},
{
"epoch": 3.81120646138314,
"grad_norm": 0.8736279010772705,
"learning_rate": 7.132761231701161e-06,
"loss": 0.0537,
"step": 7550
},
{
"epoch": 3.8162544169611308,
"grad_norm": 0.4374080002307892,
"learning_rate": 7.102473498233216e-06,
"loss": 0.057,
"step": 7560
},
{
"epoch": 3.8213023725391215,
"grad_norm": 0.863776445388794,
"learning_rate": 7.07218576476527e-06,
"loss": 0.049,
"step": 7570
},
{
"epoch": 3.8263503281171127,
"grad_norm": 0.5356324315071106,
"learning_rate": 7.041898031297325e-06,
"loss": 0.0578,
"step": 7580
},
{
"epoch": 3.8313982836951035,
"grad_norm": 0.5422727465629578,
"learning_rate": 7.0116102978293786e-06,
"loss": 0.0577,
"step": 7590
},
{
"epoch": 3.8364462392730942,
"grad_norm": 0.6234108805656433,
"learning_rate": 6.981322564361434e-06,
"loss": 0.0573,
"step": 7600
},
{
"epoch": 3.8414941948510855,
"grad_norm": 0.9067860841751099,
"learning_rate": 6.951034830893489e-06,
"loss": 0.0471,
"step": 7610
},
{
"epoch": 3.846542150429076,
"grad_norm": 0.5522469878196716,
"learning_rate": 6.920747097425543e-06,
"loss": 0.053,
"step": 7620
},
{
"epoch": 3.851590106007067,
"grad_norm": 0.7358270287513733,
"learning_rate": 6.8904593639575974e-06,
"loss": 0.0561,
"step": 7630
},
{
"epoch": 3.856638061585058,
"grad_norm": 0.5285794138908386,
"learning_rate": 6.860171630489652e-06,
"loss": 0.0618,
"step": 7640
},
{
"epoch": 3.861686017163049,
"grad_norm": 0.6937068700790405,
"learning_rate": 6.829883897021707e-06,
"loss": 0.059,
"step": 7650
},
{
"epoch": 3.8667339727410397,
"grad_norm": 0.6941738724708557,
"learning_rate": 6.79959616355376e-06,
"loss": 0.0515,
"step": 7660
},
{
"epoch": 3.871781928319031,
"grad_norm": 0.8964054584503174,
"learning_rate": 6.7693084300858155e-06,
"loss": 0.0526,
"step": 7670
},
{
"epoch": 3.8768298838970217,
"grad_norm": 0.5919986367225647,
"learning_rate": 6.739020696617871e-06,
"loss": 0.0577,
"step": 7680
},
{
"epoch": 3.8818778394750124,
"grad_norm": 0.4616561532020569,
"learning_rate": 6.708732963149924e-06,
"loss": 0.0509,
"step": 7690
},
{
"epoch": 3.8869257950530036,
"grad_norm": 0.6349731087684631,
"learning_rate": 6.678445229681979e-06,
"loss": 0.0535,
"step": 7700
},
{
"epoch": 3.8919737506309944,
"grad_norm": 0.6474828720092773,
"learning_rate": 6.6481574962140335e-06,
"loss": 0.0552,
"step": 7710
},
{
"epoch": 3.897021706208985,
"grad_norm": 0.5433930158615112,
"learning_rate": 6.617869762746088e-06,
"loss": 0.062,
"step": 7720
},
{
"epoch": 3.9020696617869763,
"grad_norm": 0.6113614439964294,
"learning_rate": 6.587582029278142e-06,
"loss": 0.06,
"step": 7730
},
{
"epoch": 3.907117617364967,
"grad_norm": 0.8800488114356995,
"learning_rate": 6.557294295810197e-06,
"loss": 0.0578,
"step": 7740
},
{
"epoch": 3.912165572942958,
"grad_norm": 0.5158660411834717,
"learning_rate": 6.5270065623422515e-06,
"loss": 0.0524,
"step": 7750
},
{
"epoch": 3.917213528520949,
"grad_norm": 0.5676606297492981,
"learning_rate": 6.496718828874306e-06,
"loss": 0.0474,
"step": 7760
},
{
"epoch": 3.92226148409894,
"grad_norm": 0.6438203454017639,
"learning_rate": 6.466431095406361e-06,
"loss": 0.0587,
"step": 7770
},
{
"epoch": 3.9273094396769306,
"grad_norm": 0.6570119857788086,
"learning_rate": 6.436143361938415e-06,
"loss": 0.0489,
"step": 7780
},
{
"epoch": 3.932357395254922,
"grad_norm": 0.5620145201683044,
"learning_rate": 6.4058556284704695e-06,
"loss": 0.0559,
"step": 7790
},
{
"epoch": 3.9374053508329125,
"grad_norm": 0.6886317729949951,
"learning_rate": 6.375567895002524e-06,
"loss": 0.0581,
"step": 7800
},
{
"epoch": 3.9424533064109037,
"grad_norm": 0.7463077306747437,
"learning_rate": 6.345280161534579e-06,
"loss": 0.0477,
"step": 7810
},
{
"epoch": 3.9475012619888945,
"grad_norm": 0.5246394276618958,
"learning_rate": 6.314992428066633e-06,
"loss": 0.0501,
"step": 7820
},
{
"epoch": 3.9525492175668853,
"grad_norm": 0.5147930979728699,
"learning_rate": 6.2847046945986876e-06,
"loss": 0.0603,
"step": 7830
},
{
"epoch": 3.9575971731448765,
"grad_norm": 0.3963003158569336,
"learning_rate": 6.254416961130743e-06,
"loss": 0.059,
"step": 7840
},
{
"epoch": 3.9626451287228672,
"grad_norm": 0.7148598432540894,
"learning_rate": 6.224129227662796e-06,
"loss": 0.0524,
"step": 7850
},
{
"epoch": 3.967693084300858,
"grad_norm": 0.5985211133956909,
"learning_rate": 6.193841494194851e-06,
"loss": 0.0609,
"step": 7860
},
{
"epoch": 3.972741039878849,
"grad_norm": 0.6152123808860779,
"learning_rate": 6.163553760726906e-06,
"loss": 0.0622,
"step": 7870
},
{
"epoch": 3.97778899545684,
"grad_norm": 0.49580270051956177,
"learning_rate": 6.13326602725896e-06,
"loss": 0.056,
"step": 7880
},
{
"epoch": 3.982836951034831,
"grad_norm": 0.8874292373657227,
"learning_rate": 6.102978293791015e-06,
"loss": 0.0599,
"step": 7890
},
{
"epoch": 3.987884906612822,
"grad_norm": 0.6198350787162781,
"learning_rate": 6.072690560323069e-06,
"loss": 0.0546,
"step": 7900
},
{
"epoch": 3.9929328621908127,
"grad_norm": 0.39257192611694336,
"learning_rate": 6.042402826855124e-06,
"loss": 0.0523,
"step": 7910
},
{
"epoch": 3.997980817768804,
"grad_norm": 0.4612904191017151,
"learning_rate": 6.012115093387178e-06,
"loss": 0.0685,
"step": 7920
},
{
"epoch": 4.0,
"eval_f1": 0.9705180789481339,
"eval_loss": 0.038467586040496826,
"eval_runtime": 578.9562,
"eval_samples_per_second": 356.265,
"eval_steps_per_second": 2.784,
"step": 7924
},
{
"epoch": 4.003028773346794,
"grad_norm": 0.7146291732788086,
"learning_rate": 5.981827359919233e-06,
"loss": 0.0575,
"step": 7930
},
{
"epoch": 4.008076728924785,
"grad_norm": 0.6313480138778687,
"learning_rate": 5.951539626451287e-06,
"loss": 0.0581,
"step": 7940
},
{
"epoch": 4.013124684502777,
"grad_norm": 0.4977870583534241,
"learning_rate": 5.921251892983342e-06,
"loss": 0.0582,
"step": 7950
},
{
"epoch": 4.018172640080767,
"grad_norm": 0.4447147250175476,
"learning_rate": 5.890964159515397e-06,
"loss": 0.0544,
"step": 7960
},
{
"epoch": 4.023220595658758,
"grad_norm": 0.6496310234069824,
"learning_rate": 5.860676426047451e-06,
"loss": 0.0595,
"step": 7970
},
{
"epoch": 4.028268551236749,
"grad_norm": 0.4380001127719879,
"learning_rate": 5.830388692579505e-06,
"loss": 0.0549,
"step": 7980
},
{
"epoch": 4.03331650681474,
"grad_norm": 0.5718368887901306,
"learning_rate": 5.80010095911156e-06,
"loss": 0.0559,
"step": 7990
},
{
"epoch": 4.038364462392731,
"grad_norm": 0.5859358906745911,
"learning_rate": 5.769813225643615e-06,
"loss": 0.0572,
"step": 8000
},
{
"epoch": 4.043412417970722,
"grad_norm": 0.49378788471221924,
"learning_rate": 5.739525492175669e-06,
"loss": 0.054,
"step": 8010
},
{
"epoch": 4.048460373548712,
"grad_norm": 0.6780097484588623,
"learning_rate": 5.709237758707723e-06,
"loss": 0.0568,
"step": 8020
},
{
"epoch": 4.053508329126704,
"grad_norm": 0.8048389554023743,
"learning_rate": 5.6789500252397786e-06,
"loss": 0.0527,
"step": 8030
},
{
"epoch": 4.058556284704695,
"grad_norm": 0.4513346254825592,
"learning_rate": 5.648662291771832e-06,
"loss": 0.0597,
"step": 8040
},
{
"epoch": 4.063604240282685,
"grad_norm": 0.6877405643463135,
"learning_rate": 5.618374558303887e-06,
"loss": 0.0594,
"step": 8050
},
{
"epoch": 4.068652195860676,
"grad_norm": 0.41468387842178345,
"learning_rate": 5.5880868248359414e-06,
"loss": 0.0563,
"step": 8060
},
{
"epoch": 4.0737001514386675,
"grad_norm": 0.5062978267669678,
"learning_rate": 5.557799091367996e-06,
"loss": 0.0598,
"step": 8070
},
{
"epoch": 4.078748107016659,
"grad_norm": 0.6427041888237,
"learning_rate": 5.527511357900051e-06,
"loss": 0.057,
"step": 8080
},
{
"epoch": 4.083796062594649,
"grad_norm": 0.5508936643600464,
"learning_rate": 5.497223624432105e-06,
"loss": 0.0472,
"step": 8090
},
{
"epoch": 4.08884401817264,
"grad_norm": 0.39490872621536255,
"learning_rate": 5.4669358909641595e-06,
"loss": 0.0589,
"step": 8100
},
{
"epoch": 4.093891973750631,
"grad_norm": 0.5776220560073853,
"learning_rate": 5.436648157496214e-06,
"loss": 0.0602,
"step": 8110
},
{
"epoch": 4.098939929328622,
"grad_norm": 0.36714500188827515,
"learning_rate": 5.406360424028269e-06,
"loss": 0.0474,
"step": 8120
},
{
"epoch": 4.103987884906613,
"grad_norm": 0.7429747581481934,
"learning_rate": 5.376072690560323e-06,
"loss": 0.0516,
"step": 8130
},
{
"epoch": 4.109035840484604,
"grad_norm": 0.7167190909385681,
"learning_rate": 5.3457849570923775e-06,
"loss": 0.0559,
"step": 8140
},
{
"epoch": 4.1140837960625944,
"grad_norm": 0.5668296217918396,
"learning_rate": 5.315497223624433e-06,
"loss": 0.0558,
"step": 8150
},
{
"epoch": 4.119131751640586,
"grad_norm": 0.5577311515808105,
"learning_rate": 5.285209490156487e-06,
"loss": 0.0589,
"step": 8160
},
{
"epoch": 4.124179707218577,
"grad_norm": 0.611304759979248,
"learning_rate": 5.254921756688541e-06,
"loss": 0.0546,
"step": 8170
},
{
"epoch": 4.129227662796567,
"grad_norm": 0.5540894865989685,
"learning_rate": 5.2246340232205955e-06,
"loss": 0.0611,
"step": 8180
},
{
"epoch": 4.134275618374558,
"grad_norm": 0.5128312706947327,
"learning_rate": 5.194346289752651e-06,
"loss": 0.0552,
"step": 8190
},
{
"epoch": 4.13932357395255,
"grad_norm": 0.6017599105834961,
"learning_rate": 5.164058556284704e-06,
"loss": 0.0494,
"step": 8200
},
{
"epoch": 4.14437152953054,
"grad_norm": 0.42843466997146606,
"learning_rate": 5.133770822816759e-06,
"loss": 0.0534,
"step": 8210
},
{
"epoch": 4.149419485108531,
"grad_norm": 0.6050401926040649,
"learning_rate": 5.103483089348814e-06,
"loss": 0.0524,
"step": 8220
},
{
"epoch": 4.154467440686522,
"grad_norm": 0.512793242931366,
"learning_rate": 5.073195355880868e-06,
"loss": 0.0562,
"step": 8230
},
{
"epoch": 4.159515396264513,
"grad_norm": 0.5130860209465027,
"learning_rate": 5.042907622412923e-06,
"loss": 0.0413,
"step": 8240
},
{
"epoch": 4.164563351842504,
"grad_norm": 0.6443082690238953,
"learning_rate": 5.012619888944977e-06,
"loss": 0.0593,
"step": 8250
},
{
"epoch": 4.169611307420495,
"grad_norm": 0.6051344871520996,
"learning_rate": 4.982332155477032e-06,
"loss": 0.0542,
"step": 8260
},
{
"epoch": 4.174659262998485,
"grad_norm": 0.5795598030090332,
"learning_rate": 4.952044422009086e-06,
"loss": 0.0569,
"step": 8270
},
{
"epoch": 4.1797072185764765,
"grad_norm": 0.6054142117500305,
"learning_rate": 4.921756688541141e-06,
"loss": 0.0575,
"step": 8280
},
{
"epoch": 4.184755174154468,
"grad_norm": 0.6954050660133362,
"learning_rate": 4.891468955073196e-06,
"loss": 0.0609,
"step": 8290
},
{
"epoch": 4.189803129732458,
"grad_norm": 0.7217870354652405,
"learning_rate": 4.86118122160525e-06,
"loss": 0.0559,
"step": 8300
},
{
"epoch": 4.194851085310449,
"grad_norm": 0.49758586287498474,
"learning_rate": 4.830893488137305e-06,
"loss": 0.0506,
"step": 8310
},
{
"epoch": 4.1998990408884405,
"grad_norm": 0.4497081935405731,
"learning_rate": 4.800605754669359e-06,
"loss": 0.0581,
"step": 8320
},
{
"epoch": 4.204946996466431,
"grad_norm": 0.6054022312164307,
"learning_rate": 4.770318021201413e-06,
"loss": 0.0596,
"step": 8330
},
{
"epoch": 4.209994952044422,
"grad_norm": 0.7262012958526611,
"learning_rate": 4.7400302877334685e-06,
"loss": 0.0489,
"step": 8340
},
{
"epoch": 4.215042907622413,
"grad_norm": 0.6226342916488647,
"learning_rate": 4.709742554265523e-06,
"loss": 0.0596,
"step": 8350
},
{
"epoch": 4.2200908632004035,
"grad_norm": 0.8234953284263611,
"learning_rate": 4.679454820797577e-06,
"loss": 0.057,
"step": 8360
},
{
"epoch": 4.225138818778395,
"grad_norm": 0.8438859581947327,
"learning_rate": 4.649167087329631e-06,
"loss": 0.0516,
"step": 8370
},
{
"epoch": 4.230186774356386,
"grad_norm": 0.5095875263214111,
"learning_rate": 4.6188793538616865e-06,
"loss": 0.0646,
"step": 8380
},
{
"epoch": 4.235234729934376,
"grad_norm": 0.5543855428695679,
"learning_rate": 4.58859162039374e-06,
"loss": 0.0482,
"step": 8390
},
{
"epoch": 4.240282685512367,
"grad_norm": 0.7510880827903748,
"learning_rate": 4.558303886925795e-06,
"loss": 0.0595,
"step": 8400
},
{
"epoch": 4.245330641090359,
"grad_norm": 0.5140940546989441,
"learning_rate": 4.52801615345785e-06,
"loss": 0.0568,
"step": 8410
},
{
"epoch": 4.250378596668349,
"grad_norm": 0.43089789152145386,
"learning_rate": 4.497728419989904e-06,
"loss": 0.058,
"step": 8420
},
{
"epoch": 4.25542655224634,
"grad_norm": 0.6229716539382935,
"learning_rate": 4.467440686521959e-06,
"loss": 0.0538,
"step": 8430
},
{
"epoch": 4.260474507824331,
"grad_norm": 0.6465341448783875,
"learning_rate": 4.437152953054013e-06,
"loss": 0.0544,
"step": 8440
},
{
"epoch": 4.265522463402322,
"grad_norm": 0.42706695199012756,
"learning_rate": 4.406865219586068e-06,
"loss": 0.0562,
"step": 8450
},
{
"epoch": 4.270570418980313,
"grad_norm": 0.5305337309837341,
"learning_rate": 4.376577486118122e-06,
"loss": 0.0567,
"step": 8460
},
{
"epoch": 4.275618374558304,
"grad_norm": 0.7307097315788269,
"learning_rate": 4.346289752650177e-06,
"loss": 0.0486,
"step": 8470
},
{
"epoch": 4.280666330136295,
"grad_norm": 0.5940870046615601,
"learning_rate": 4.316002019182232e-06,
"loss": 0.0514,
"step": 8480
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.4446733593940735,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.0545,
"step": 8490
},
{
"epoch": 4.290762241292277,
"grad_norm": 0.9121294617652893,
"learning_rate": 4.255426552246341e-06,
"loss": 0.0557,
"step": 8500
},
{
"epoch": 4.295810196870267,
"grad_norm": 0.568056583404541,
"learning_rate": 4.225138818778395e-06,
"loss": 0.0522,
"step": 8510
},
{
"epoch": 4.300858152448258,
"grad_norm": 0.8788109421730042,
"learning_rate": 4.194851085310449e-06,
"loss": 0.0433,
"step": 8520
},
{
"epoch": 4.3059061080262495,
"grad_norm": 0.7445030808448792,
"learning_rate": 4.1645633518425035e-06,
"loss": 0.05,
"step": 8530
},
{
"epoch": 4.310954063604241,
"grad_norm": 0.8348667621612549,
"learning_rate": 4.134275618374559e-06,
"loss": 0.0584,
"step": 8540
},
{
"epoch": 4.316002019182231,
"grad_norm": 0.462342232465744,
"learning_rate": 4.103987884906613e-06,
"loss": 0.0555,
"step": 8550
},
{
"epoch": 4.321049974760222,
"grad_norm": 0.42785176634788513,
"learning_rate": 4.073700151438667e-06,
"loss": 0.0607,
"step": 8560
},
{
"epoch": 4.326097930338213,
"grad_norm": 0.7172122597694397,
"learning_rate": 4.043412417970722e-06,
"loss": 0.0675,
"step": 8570
},
{
"epoch": 4.331145885916204,
"grad_norm": 0.4495554566383362,
"learning_rate": 4.013124684502776e-06,
"loss": 0.0546,
"step": 8580
},
{
"epoch": 4.336193841494195,
"grad_norm": 0.5083460807800293,
"learning_rate": 3.982836951034831e-06,
"loss": 0.06,
"step": 8590
},
{
"epoch": 4.341241797072186,
"grad_norm": 0.4353145360946655,
"learning_rate": 3.952549217566885e-06,
"loss": 0.0535,
"step": 8600
},
{
"epoch": 4.3462897526501765,
"grad_norm": 0.6741386651992798,
"learning_rate": 3.92226148409894e-06,
"loss": 0.0581,
"step": 8610
},
{
"epoch": 4.351337708228168,
"grad_norm": 0.47798269987106323,
"learning_rate": 3.891973750630995e-06,
"loss": 0.0541,
"step": 8620
},
{
"epoch": 4.356385663806159,
"grad_norm": 0.49109166860580444,
"learning_rate": 3.861686017163049e-06,
"loss": 0.0608,
"step": 8630
},
{
"epoch": 4.361433619384149,
"grad_norm": 0.8310505747795105,
"learning_rate": 3.831398283695104e-06,
"loss": 0.0514,
"step": 8640
},
{
"epoch": 4.36648157496214,
"grad_norm": 0.4586045742034912,
"learning_rate": 3.801110550227158e-06,
"loss": 0.0538,
"step": 8650
},
{
"epoch": 4.371529530540132,
"grad_norm": 0.4350300133228302,
"learning_rate": 3.7708228167592127e-06,
"loss": 0.0526,
"step": 8660
},
{
"epoch": 4.376577486118122,
"grad_norm": 0.6310685276985168,
"learning_rate": 3.740535083291267e-06,
"loss": 0.0597,
"step": 8670
},
{
"epoch": 4.381625441696113,
"grad_norm": 0.6845548152923584,
"learning_rate": 3.7102473498233217e-06,
"loss": 0.0542,
"step": 8680
},
{
"epoch": 4.386673397274104,
"grad_norm": 1.085631012916565,
"learning_rate": 3.679959616355376e-06,
"loss": 0.0601,
"step": 8690
},
{
"epoch": 4.391721352852095,
"grad_norm": 0.6232538223266602,
"learning_rate": 3.6496718828874303e-06,
"loss": 0.0557,
"step": 8700
},
{
"epoch": 4.396769308430086,
"grad_norm": 0.4568091630935669,
"learning_rate": 3.6193841494194855e-06,
"loss": 0.0494,
"step": 8710
},
{
"epoch": 4.401817264008077,
"grad_norm": 0.7550612092018127,
"learning_rate": 3.5890964159515398e-06,
"loss": 0.0562,
"step": 8720
},
{
"epoch": 4.406865219586067,
"grad_norm": 0.5380585789680481,
"learning_rate": 3.5588086824835945e-06,
"loss": 0.0521,
"step": 8730
},
{
"epoch": 4.411913175164059,
"grad_norm": 0.42225027084350586,
"learning_rate": 3.5285209490156488e-06,
"loss": 0.0515,
"step": 8740
},
{
"epoch": 4.41696113074205,
"grad_norm": 0.5831999778747559,
"learning_rate": 3.498233215547703e-06,
"loss": 0.0465,
"step": 8750
},
{
"epoch": 4.42200908632004,
"grad_norm": 0.7943524718284607,
"learning_rate": 3.4679454820797578e-06,
"loss": 0.062,
"step": 8760
},
{
"epoch": 4.427057041898031,
"grad_norm": 0.634747326374054,
"learning_rate": 3.437657748611812e-06,
"loss": 0.0496,
"step": 8770
},
{
"epoch": 4.4321049974760225,
"grad_norm": 0.5734288692474365,
"learning_rate": 3.407370015143867e-06,
"loss": 0.0612,
"step": 8780
},
{
"epoch": 4.437152953054013,
"grad_norm": 0.7079018354415894,
"learning_rate": 3.3770822816759215e-06,
"loss": 0.0578,
"step": 8790
},
{
"epoch": 4.442200908632004,
"grad_norm": 0.44444698095321655,
"learning_rate": 3.346794548207976e-06,
"loss": 0.0559,
"step": 8800
},
{
"epoch": 4.447248864209995,
"grad_norm": 0.7473122477531433,
"learning_rate": 3.3165068147400305e-06,
"loss": 0.0544,
"step": 8810
},
{
"epoch": 4.4522968197879855,
"grad_norm": 0.6658338308334351,
"learning_rate": 3.286219081272085e-06,
"loss": 0.0552,
"step": 8820
},
{
"epoch": 4.457344775365977,
"grad_norm": 0.48870500922203064,
"learning_rate": 3.255931347804139e-06,
"loss": 0.0566,
"step": 8830
},
{
"epoch": 4.462392730943968,
"grad_norm": 0.6261917948722839,
"learning_rate": 3.2256436143361943e-06,
"loss": 0.0487,
"step": 8840
},
{
"epoch": 4.467440686521958,
"grad_norm": 0.6060011982917786,
"learning_rate": 3.1953558808682486e-06,
"loss": 0.0514,
"step": 8850
},
{
"epoch": 4.4724886420999495,
"grad_norm": 0.4858971834182739,
"learning_rate": 3.165068147400303e-06,
"loss": 0.05,
"step": 8860
},
{
"epoch": 4.477536597677941,
"grad_norm": 0.6394979357719421,
"learning_rate": 3.1347804139323576e-06,
"loss": 0.0604,
"step": 8870
},
{
"epoch": 4.482584553255931,
"grad_norm": 0.6840482950210571,
"learning_rate": 3.104492680464412e-06,
"loss": 0.0514,
"step": 8880
},
{
"epoch": 4.487632508833922,
"grad_norm": 0.388715535402298,
"learning_rate": 3.0742049469964666e-06,
"loss": 0.0479,
"step": 8890
},
{
"epoch": 4.492680464411913,
"grad_norm": 0.6516565084457397,
"learning_rate": 3.043917213528521e-06,
"loss": 0.0608,
"step": 8900
},
{
"epoch": 4.497728419989904,
"grad_norm": 0.76282799243927,
"learning_rate": 3.0136294800605756e-06,
"loss": 0.0572,
"step": 8910
},
{
"epoch": 4.502776375567895,
"grad_norm": 0.49448370933532715,
"learning_rate": 2.9833417465926303e-06,
"loss": 0.0575,
"step": 8920
},
{
"epoch": 4.507824331145886,
"grad_norm": 0.5593730807304382,
"learning_rate": 2.9530540131246846e-06,
"loss": 0.0486,
"step": 8930
},
{
"epoch": 4.512872286723876,
"grad_norm": 0.5773325562477112,
"learning_rate": 2.922766279656739e-06,
"loss": 0.0541,
"step": 8940
},
{
"epoch": 4.517920242301868,
"grad_norm": 0.34630000591278076,
"learning_rate": 2.8924785461887936e-06,
"loss": 0.0606,
"step": 8950
},
{
"epoch": 4.522968197879859,
"grad_norm": 0.5409483313560486,
"learning_rate": 2.862190812720848e-06,
"loss": 0.0589,
"step": 8960
},
{
"epoch": 4.52801615345785,
"grad_norm": 0.5004202127456665,
"learning_rate": 2.8319030792529026e-06,
"loss": 0.0621,
"step": 8970
},
{
"epoch": 4.53306410903584,
"grad_norm": 0.4979722797870636,
"learning_rate": 2.8016153457849574e-06,
"loss": 0.0537,
"step": 8980
},
{
"epoch": 4.5381120646138315,
"grad_norm": 0.6733251214027405,
"learning_rate": 2.7713276123170117e-06,
"loss": 0.069,
"step": 8990
},
{
"epoch": 4.543160020191822,
"grad_norm": 0.4152880609035492,
"learning_rate": 2.7410398788490664e-06,
"loss": 0.0565,
"step": 9000
},
{
"epoch": 4.548207975769813,
"grad_norm": 0.6170037984848022,
"learning_rate": 2.7107521453811207e-06,
"loss": 0.0589,
"step": 9010
},
{
"epoch": 4.553255931347804,
"grad_norm": 0.5258937478065491,
"learning_rate": 2.680464411913175e-06,
"loss": 0.0548,
"step": 9020
},
{
"epoch": 4.5583038869257955,
"grad_norm": 0.534015417098999,
"learning_rate": 2.6501766784452297e-06,
"loss": 0.0447,
"step": 9030
},
{
"epoch": 4.563351842503786,
"grad_norm": 0.86041259765625,
"learning_rate": 2.6198889449772844e-06,
"loss": 0.0578,
"step": 9040
},
{
"epoch": 4.568399798081777,
"grad_norm": 0.8807480335235596,
"learning_rate": 2.589601211509339e-06,
"loss": 0.0479,
"step": 9050
},
{
"epoch": 4.573447753659767,
"grad_norm": 0.6071127653121948,
"learning_rate": 2.5593134780413934e-06,
"loss": 0.0521,
"step": 9060
},
{
"epoch": 4.5784957092377585,
"grad_norm": 0.9106950759887695,
"learning_rate": 2.5290257445734477e-06,
"loss": 0.056,
"step": 9070
},
{
"epoch": 4.58354366481575,
"grad_norm": 0.6179044246673584,
"learning_rate": 2.4987380111055024e-06,
"loss": 0.0548,
"step": 9080
},
{
"epoch": 4.588591620393741,
"grad_norm": 0.9295970797538757,
"learning_rate": 2.4684502776375567e-06,
"loss": 0.0626,
"step": 9090
},
{
"epoch": 4.593639575971731,
"grad_norm": 0.4483726918697357,
"learning_rate": 2.438162544169611e-06,
"loss": 0.0531,
"step": 9100
},
{
"epoch": 4.598687531549722,
"grad_norm": 0.38749760389328003,
"learning_rate": 2.407874810701666e-06,
"loss": 0.0514,
"step": 9110
},
{
"epoch": 4.603735487127714,
"grad_norm": 0.7203320860862732,
"learning_rate": 2.3775870772337205e-06,
"loss": 0.0603,
"step": 9120
},
{
"epoch": 4.608783442705704,
"grad_norm": 0.8010473251342773,
"learning_rate": 2.347299343765775e-06,
"loss": 0.053,
"step": 9130
},
{
"epoch": 4.613831398283695,
"grad_norm": 0.7866964936256409,
"learning_rate": 2.3170116102978295e-06,
"loss": 0.0544,
"step": 9140
},
{
"epoch": 4.618879353861686,
"grad_norm": 0.9333378076553345,
"learning_rate": 2.2867238768298838e-06,
"loss": 0.0472,
"step": 9150
},
{
"epoch": 4.623927309439677,
"grad_norm": 0.5904621481895447,
"learning_rate": 2.2564361433619385e-06,
"loss": 0.0515,
"step": 9160
},
{
"epoch": 4.628975265017668,
"grad_norm": 0.6837446093559265,
"learning_rate": 2.2261484098939928e-06,
"loss": 0.0566,
"step": 9170
},
{
"epoch": 4.634023220595659,
"grad_norm": 0.5726220607757568,
"learning_rate": 2.1958606764260475e-06,
"loss": 0.0521,
"step": 9180
},
{
"epoch": 4.639071176173649,
"grad_norm": 0.5920945405960083,
"learning_rate": 2.1655729429581022e-06,
"loss": 0.0527,
"step": 9190
},
{
"epoch": 4.644119131751641,
"grad_norm": 0.5921088457107544,
"learning_rate": 2.1352852094901565e-06,
"loss": 0.0594,
"step": 9200
},
{
"epoch": 4.649167087329632,
"grad_norm": 0.8026402592658997,
"learning_rate": 2.1049974760222112e-06,
"loss": 0.058,
"step": 9210
},
{
"epoch": 4.654215042907622,
"grad_norm": 0.9913181066513062,
"learning_rate": 2.0747097425542655e-06,
"loss": 0.0591,
"step": 9220
},
{
"epoch": 4.659262998485613,
"grad_norm": 0.675123393535614,
"learning_rate": 2.04442200908632e-06,
"loss": 0.0561,
"step": 9230
},
{
"epoch": 4.6643109540636045,
"grad_norm": 0.5947641730308533,
"learning_rate": 2.014134275618375e-06,
"loss": 0.0486,
"step": 9240
},
{
"epoch": 4.669358909641595,
"grad_norm": 0.5389765501022339,
"learning_rate": 1.9838465421504293e-06,
"loss": 0.0586,
"step": 9250
},
{
"epoch": 4.674406865219586,
"grad_norm": 0.5905711054801941,
"learning_rate": 1.9535588086824836e-06,
"loss": 0.0523,
"step": 9260
},
{
"epoch": 4.679454820797577,
"grad_norm": 0.36754655838012695,
"learning_rate": 1.9232710752145383e-06,
"loss": 0.0518,
"step": 9270
},
{
"epoch": 4.684502776375568,
"grad_norm": 0.5583412647247314,
"learning_rate": 1.8929833417465926e-06,
"loss": 0.0536,
"step": 9280
},
{
"epoch": 4.689550731953559,
"grad_norm": 0.4586925506591797,
"learning_rate": 1.8626956082786473e-06,
"loss": 0.0482,
"step": 9290
},
{
"epoch": 4.69459868753155,
"grad_norm": 0.4932919442653656,
"learning_rate": 1.8324078748107018e-06,
"loss": 0.0484,
"step": 9300
},
{
"epoch": 4.69964664310954,
"grad_norm": 0.3211473524570465,
"learning_rate": 1.802120141342756e-06,
"loss": 0.0522,
"step": 9310
},
{
"epoch": 4.7046945986875315,
"grad_norm": 0.8603491187095642,
"learning_rate": 1.7718324078748106e-06,
"loss": 0.0585,
"step": 9320
},
{
"epoch": 4.709742554265523,
"grad_norm": 0.7181740999221802,
"learning_rate": 1.7415446744068653e-06,
"loss": 0.0522,
"step": 9330
},
{
"epoch": 4.714790509843513,
"grad_norm": 0.49415314197540283,
"learning_rate": 1.7112569409389198e-06,
"loss": 0.0417,
"step": 9340
},
{
"epoch": 4.719838465421504,
"grad_norm": 0.758638322353363,
"learning_rate": 1.6809692074709741e-06,
"loss": 0.0608,
"step": 9350
},
{
"epoch": 4.724886420999495,
"grad_norm": 0.6659887433052063,
"learning_rate": 1.6506814740030288e-06,
"loss": 0.0468,
"step": 9360
},
{
"epoch": 4.729934376577486,
"grad_norm": 0.3270837962627411,
"learning_rate": 1.6203937405350833e-06,
"loss": 0.0602,
"step": 9370
},
{
"epoch": 4.734982332155477,
"grad_norm": 0.6695159077644348,
"learning_rate": 1.5901060070671379e-06,
"loss": 0.0515,
"step": 9380
},
{
"epoch": 4.740030287733468,
"grad_norm": 0.8143603205680847,
"learning_rate": 1.5598182735991924e-06,
"loss": 0.0613,
"step": 9390
},
{
"epoch": 4.745078243311459,
"grad_norm": 0.6727936863899231,
"learning_rate": 1.5295305401312469e-06,
"loss": 0.0505,
"step": 9400
},
{
"epoch": 4.75012619888945,
"grad_norm": 0.5365564823150635,
"learning_rate": 1.4992428066633014e-06,
"loss": 0.0512,
"step": 9410
},
{
"epoch": 4.755174154467441,
"grad_norm": 0.5240725874900818,
"learning_rate": 1.4689550731953559e-06,
"loss": 0.0526,
"step": 9420
},
{
"epoch": 4.760222110045431,
"grad_norm": 0.6975441575050354,
"learning_rate": 1.4386673397274104e-06,
"loss": 0.0592,
"step": 9430
},
{
"epoch": 4.765270065623422,
"grad_norm": 0.44649407267570496,
"learning_rate": 1.408379606259465e-06,
"loss": 0.0597,
"step": 9440
},
{
"epoch": 4.770318021201414,
"grad_norm": 0.598850429058075,
"learning_rate": 1.3780918727915194e-06,
"loss": 0.0606,
"step": 9450
},
{
"epoch": 4.775365976779405,
"grad_norm": 0.57352614402771,
"learning_rate": 1.3478041393235741e-06,
"loss": 0.0502,
"step": 9460
},
{
"epoch": 4.780413932357395,
"grad_norm": 0.7437055706977844,
"learning_rate": 1.3175164058556284e-06,
"loss": 0.0521,
"step": 9470
},
{
"epoch": 4.785461887935386,
"grad_norm": 0.6993494629859924,
"learning_rate": 1.287228672387683e-06,
"loss": 0.0565,
"step": 9480
},
{
"epoch": 4.790509843513377,
"grad_norm": 0.8067084550857544,
"learning_rate": 1.2569409389197376e-06,
"loss": 0.0575,
"step": 9490
},
{
"epoch": 4.795557799091368,
"grad_norm": 0.5363942384719849,
"learning_rate": 1.2266532054517921e-06,
"loss": 0.058,
"step": 9500
},
{
"epoch": 4.800605754669359,
"grad_norm": 0.8145700693130493,
"learning_rate": 1.1963654719838464e-06,
"loss": 0.0488,
"step": 9510
},
{
"epoch": 4.80565371024735,
"grad_norm": 0.7701184153556824,
"learning_rate": 1.166077738515901e-06,
"loss": 0.0577,
"step": 9520
},
{
"epoch": 4.8107016658253405,
"grad_norm": 0.5177111625671387,
"learning_rate": 1.1357900050479557e-06,
"loss": 0.0605,
"step": 9530
},
{
"epoch": 4.815749621403332,
"grad_norm": 0.44751742482185364,
"learning_rate": 1.1055022715800102e-06,
"loss": 0.0565,
"step": 9540
},
{
"epoch": 4.820797576981323,
"grad_norm": 0.37919309735298157,
"learning_rate": 1.0752145381120645e-06,
"loss": 0.0454,
"step": 9550
},
{
"epoch": 4.825845532559313,
"grad_norm": 0.6037785410881042,
"learning_rate": 1.0449268046441192e-06,
"loss": 0.0606,
"step": 9560
},
{
"epoch": 4.8308934881373045,
"grad_norm": 0.3584793508052826,
"learning_rate": 1.0146390711761737e-06,
"loss": 0.0503,
"step": 9570
},
{
"epoch": 4.835941443715296,
"grad_norm": 0.49841853976249695,
"learning_rate": 9.843513377082282e-07,
"loss": 0.0434,
"step": 9580
},
{
"epoch": 4.840989399293286,
"grad_norm": 0.5114769339561462,
"learning_rate": 9.540636042402827e-07,
"loss": 0.0535,
"step": 9590
},
{
"epoch": 4.846037354871277,
"grad_norm": 0.5932824611663818,
"learning_rate": 9.237758707723372e-07,
"loss": 0.0547,
"step": 9600
},
{
"epoch": 4.851085310449268,
"grad_norm": 0.6020333766937256,
"learning_rate": 8.934881373043917e-07,
"loss": 0.0597,
"step": 9610
},
{
"epoch": 4.856133266027259,
"grad_norm": 0.721193790435791,
"learning_rate": 8.632004038364462e-07,
"loss": 0.0614,
"step": 9620
},
{
"epoch": 4.86118122160525,
"grad_norm": 0.4858354926109314,
"learning_rate": 8.329126703685008e-07,
"loss": 0.0555,
"step": 9630
},
{
"epoch": 4.866229177183241,
"grad_norm": 0.7863103747367859,
"learning_rate": 8.026249369005552e-07,
"loss": 0.0554,
"step": 9640
},
{
"epoch": 4.871277132761231,
"grad_norm": 0.8363025784492493,
"learning_rate": 7.723372034326099e-07,
"loss": 0.0565,
"step": 9650
},
{
"epoch": 4.876325088339223,
"grad_norm": 0.6137521266937256,
"learning_rate": 7.420494699646643e-07,
"loss": 0.0575,
"step": 9660
},
{
"epoch": 4.881373043917214,
"grad_norm": 0.4781091511249542,
"learning_rate": 7.117617364967189e-07,
"loss": 0.0478,
"step": 9670
},
{
"epoch": 4.886420999495204,
"grad_norm": 0.8294112086296082,
"learning_rate": 6.814740030287734e-07,
"loss": 0.0593,
"step": 9680
},
{
"epoch": 4.891468955073195,
"grad_norm": 0.5780894160270691,
"learning_rate": 6.511862695608279e-07,
"loss": 0.0518,
"step": 9690
},
{
"epoch": 4.8965169106511865,
"grad_norm": 0.4407060146331787,
"learning_rate": 6.208985360928824e-07,
"loss": 0.0522,
"step": 9700
},
{
"epoch": 4.901564866229177,
"grad_norm": 0.4369337558746338,
"learning_rate": 5.906108026249369e-07,
"loss": 0.0522,
"step": 9710
},
{
"epoch": 4.906612821807168,
"grad_norm": 0.8428089022636414,
"learning_rate": 5.603230691569914e-07,
"loss": 0.0468,
"step": 9720
},
{
"epoch": 4.911660777385159,
"grad_norm": 0.6303294897079468,
"learning_rate": 5.30035335689046e-07,
"loss": 0.0577,
"step": 9730
},
{
"epoch": 4.91670873296315,
"grad_norm": 0.4869242012500763,
"learning_rate": 4.997476022211004e-07,
"loss": 0.0472,
"step": 9740
},
{
"epoch": 4.921756688541141,
"grad_norm": 0.5907611846923828,
"learning_rate": 4.69459868753155e-07,
"loss": 0.0455,
"step": 9750
},
{
"epoch": 4.926804644119132,
"grad_norm": 0.6162139177322388,
"learning_rate": 4.3917213528520954e-07,
"loss": 0.0475,
"step": 9760
},
{
"epoch": 4.931852599697122,
"grad_norm": 0.5222154259681702,
"learning_rate": 4.0888440181726405e-07,
"loss": 0.0513,
"step": 9770
},
{
"epoch": 4.9369005552751135,
"grad_norm": 0.5132977366447449,
"learning_rate": 3.7859666834931856e-07,
"loss": 0.043,
"step": 9780
},
{
"epoch": 4.941948510853105,
"grad_norm": 0.6620015501976013,
"learning_rate": 3.4830893488137306e-07,
"loss": 0.0598,
"step": 9790
},
{
"epoch": 4.946996466431095,
"grad_norm": 0.7160341143608093,
"learning_rate": 3.1802120141342757e-07,
"loss": 0.0539,
"step": 9800
},
{
"epoch": 4.952044422009086,
"grad_norm": 0.5954631567001343,
"learning_rate": 2.8773346794548213e-07,
"loss": 0.0581,
"step": 9810
},
{
"epoch": 4.957092377587077,
"grad_norm": 1.0010461807250977,
"learning_rate": 2.5744573447753664e-07,
"loss": 0.0499,
"step": 9820
},
{
"epoch": 4.962140333165069,
"grad_norm": 0.5768128633499146,
"learning_rate": 2.2715800100959112e-07,
"loss": 0.0562,
"step": 9830
},
{
"epoch": 4.967188288743059,
"grad_norm": 0.6427052617073059,
"learning_rate": 1.9687026754164563e-07,
"loss": 0.0545,
"step": 9840
},
{
"epoch": 4.97223624432105,
"grad_norm": 0.6932212114334106,
"learning_rate": 1.6658253407370016e-07,
"loss": 0.0575,
"step": 9850
},
{
"epoch": 4.9772841998990405,
"grad_norm": 0.4219547510147095,
"learning_rate": 1.3629480060575467e-07,
"loss": 0.0491,
"step": 9860
},
{
"epoch": 4.982332155477032,
"grad_norm": 0.5215485692024231,
"learning_rate": 1.0600706713780919e-07,
"loss": 0.0438,
"step": 9870
},
{
"epoch": 4.987380111055023,
"grad_norm": 0.36851760745048523,
"learning_rate": 7.57193336698637e-08,
"loss": 0.052,
"step": 9880
},
{
"epoch": 4.992428066633014,
"grad_norm": 0.5213483572006226,
"learning_rate": 4.5431600201918226e-08,
"loss": 0.0472,
"step": 9890
},
{
"epoch": 4.997476022211004,
"grad_norm": 0.710657000541687,
"learning_rate": 1.514386673397274e-08,
"loss": 0.0582,
"step": 9900
},
{
"epoch": 5.0,
"eval_f1": 0.9705180789481339,
"eval_loss": 0.03909851238131523,
"eval_runtime": 579.4034,
"eval_samples_per_second": 355.99,
"eval_steps_per_second": 2.782,
"step": 9905
},
{
"epoch": 5.0,
"step": 9905,
"total_flos": 9.82152667464321e+19,
"train_loss": 0.0,
"train_runtime": 0.0648,
"train_samples_per_second": 19542495.273,
"train_steps_per_second": 152740.8
}
],
"logging_steps": 10,
"max_steps": 9905,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.82152667464321e+19,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}