70b-mp20-13500 / trainer_state.json
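The remainder of this page is the raw trainer_state.json for checkpoint-13500. As a minimal sketch (not part of the checkpoint itself), the snippet below shows one way to inspect the log after downloading the file locally; it assumes Python with matplotlib available, and uses only keys that actually appear in this file (log_history, step, loss, eval_runtime, global_step, epoch).

import json

import matplotlib.pyplot as plt

# Load the downloaded trainer state; the path is illustrative.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry a "loss" key; evaluation entries carry "eval_runtime" instead.
train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"{state['global_step']} steps, {state['epoch']:.2f} epochs")
plt.show()
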
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9799528301886795,
"eval_steps": 1000,
"global_step": 13500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00294811320754717,
"grad_norm": 1.0473709106445312,
"learning_rate": 1e-05,
"loss": 1.6996,
"step": 10
},
{
"epoch": 0.00589622641509434,
"grad_norm": 1.3841967582702637,
"learning_rate": 2e-05,
"loss": 1.6733,
"step": 20
},
{
"epoch": 0.00884433962264151,
"grad_norm": 0.9989091753959656,
"learning_rate": 3e-05,
"loss": 1.7351,
"step": 30
},
{
"epoch": 0.01179245283018868,
"grad_norm": 2.2353155612945557,
"learning_rate": 4e-05,
"loss": 1.4985,
"step": 40
},
{
"epoch": 0.01474056603773585,
"grad_norm": 1.6346133947372437,
"learning_rate": 5e-05,
"loss": 1.234,
"step": 50
},
{
"epoch": 0.01768867924528302,
"grad_norm": 2.0600874423980713,
"learning_rate": 6e-05,
"loss": 0.916,
"step": 60
},
{
"epoch": 0.020636792452830188,
"grad_norm": 1.7062417268753052,
"learning_rate": 7e-05,
"loss": 0.7374,
"step": 70
},
{
"epoch": 0.02358490566037736,
"grad_norm": 2.0386404991149902,
"learning_rate": 8e-05,
"loss": 0.6934,
"step": 80
},
{
"epoch": 0.02653301886792453,
"grad_norm": 1.927161455154419,
"learning_rate": 9e-05,
"loss": 0.6661,
"step": 90
},
{
"epoch": 0.0294811320754717,
"grad_norm": 1.1320207118988037,
"learning_rate": 0.0001,
"loss": 0.668,
"step": 100
},
{
"epoch": 0.03242924528301887,
"grad_norm": 0.9954987168312073,
"learning_rate": 9.999997842789546e-05,
"loss": 0.6576,
"step": 110
},
{
"epoch": 0.03537735849056604,
"grad_norm": 1.770910620689392,
"learning_rate": 9.999991371160044e-05,
"loss": 0.6555,
"step": 120
},
{
"epoch": 0.038325471698113206,
"grad_norm": 1.1498017311096191,
"learning_rate": 9.99998058511708e-05,
"loss": 0.6081,
"step": 130
},
{
"epoch": 0.041273584905660375,
"grad_norm": 0.8577329516410828,
"learning_rate": 9.99996548466996e-05,
"loss": 0.5803,
"step": 140
},
{
"epoch": 0.044221698113207544,
"grad_norm": 1.1237112283706665,
"learning_rate": 9.999946069831714e-05,
"loss": 0.6287,
"step": 150
},
{
"epoch": 0.04716981132075472,
"grad_norm": 1.6234890222549438,
"learning_rate": 9.999922340619094e-05,
"loss": 0.5979,
"step": 160
},
{
"epoch": 0.05011792452830189,
"grad_norm": 1.2322404384613037,
"learning_rate": 9.999894297052576e-05,
"loss": 0.609,
"step": 170
},
{
"epoch": 0.05306603773584906,
"grad_norm": 1.241513729095459,
"learning_rate": 9.99986193915636e-05,
"loss": 0.583,
"step": 180
},
{
"epoch": 0.05601415094339623,
"grad_norm": 0.7771233916282654,
"learning_rate": 9.999825266958367e-05,
"loss": 0.5751,
"step": 190
},
{
"epoch": 0.0589622641509434,
"grad_norm": 0.9325250387191772,
"learning_rate": 9.999784280490239e-05,
"loss": 0.5806,
"step": 200
},
{
"epoch": 0.061910377358490566,
"grad_norm": 1.0086380243301392,
"learning_rate": 9.999738979787342e-05,
"loss": 0.5637,
"step": 210
},
{
"epoch": 0.06485849056603774,
"grad_norm": 0.813523530960083,
"learning_rate": 9.999689364888767e-05,
"loss": 0.5653,
"step": 220
},
{
"epoch": 0.06780660377358491,
"grad_norm": 0.7476902008056641,
"learning_rate": 9.999635435837326e-05,
"loss": 0.5752,
"step": 230
},
{
"epoch": 0.07075471698113207,
"grad_norm": 0.9477988481521606,
"learning_rate": 9.999577192679552e-05,
"loss": 0.5771,
"step": 240
},
{
"epoch": 0.07370283018867925,
"grad_norm": 1.0138146877288818,
"learning_rate": 9.999514635465705e-05,
"loss": 0.5539,
"step": 250
},
{
"epoch": 0.07665094339622641,
"grad_norm": 1.0521008968353271,
"learning_rate": 9.999447764249762e-05,
"loss": 0.5596,
"step": 260
},
{
"epoch": 0.07959905660377359,
"grad_norm": 0.8997015357017517,
"learning_rate": 9.999376579089426e-05,
"loss": 0.5771,
"step": 270
},
{
"epoch": 0.08254716981132075,
"grad_norm": 0.7213010191917419,
"learning_rate": 9.99930108004612e-05,
"loss": 0.5572,
"step": 280
},
{
"epoch": 0.08549528301886793,
"grad_norm": 1.5777337551116943,
"learning_rate": 9.999221267184993e-05,
"loss": 0.5321,
"step": 290
},
{
"epoch": 0.08844339622641509,
"grad_norm": 0.8426280617713928,
"learning_rate": 9.999137140574914e-05,
"loss": 0.5709,
"step": 300
},
{
"epoch": 0.09139150943396226,
"grad_norm": 0.777661919593811,
"learning_rate": 9.999048700288475e-05,
"loss": 0.5553,
"step": 310
},
{
"epoch": 0.09433962264150944,
"grad_norm": 0.891504168510437,
"learning_rate": 9.998955946401986e-05,
"loss": 0.5666,
"step": 320
},
{
"epoch": 0.0972877358490566,
"grad_norm": 1.0041019916534424,
"learning_rate": 9.99885887899549e-05,
"loss": 0.5412,
"step": 330
},
{
"epoch": 0.10023584905660378,
"grad_norm": 0.8665509819984436,
"learning_rate": 9.998757498152737e-05,
"loss": 0.5592,
"step": 340
},
{
"epoch": 0.10318396226415094,
"grad_norm": 0.8766894340515137,
"learning_rate": 9.998651803961212e-05,
"loss": 0.5071,
"step": 350
},
{
"epoch": 0.10613207547169812,
"grad_norm": 0.7872467041015625,
"learning_rate": 9.998541796512116e-05,
"loss": 0.5082,
"step": 360
},
{
"epoch": 0.10908018867924528,
"grad_norm": 0.8144010305404663,
"learning_rate": 9.99842747590037e-05,
"loss": 0.5312,
"step": 370
},
{
"epoch": 0.11202830188679246,
"grad_norm": 0.7267995476722717,
"learning_rate": 9.998308842224623e-05,
"loss": 0.5237,
"step": 380
},
{
"epoch": 0.11497641509433962,
"grad_norm": 0.8450999855995178,
"learning_rate": 9.99818589558724e-05,
"loss": 0.5154,
"step": 390
},
{
"epoch": 0.1179245283018868,
"grad_norm": 0.7647215723991394,
"learning_rate": 9.998058636094312e-05,
"loss": 0.4965,
"step": 400
},
{
"epoch": 0.12087264150943396,
"grad_norm": 0.8467835187911987,
"learning_rate": 9.997927063855646e-05,
"loss": 0.5037,
"step": 410
},
{
"epoch": 0.12382075471698113,
"grad_norm": 0.8728483319282532,
"learning_rate": 9.997791178984775e-05,
"loss": 0.4974,
"step": 420
},
{
"epoch": 0.1267688679245283,
"grad_norm": 2.1606669425964355,
"learning_rate": 9.997650981598953e-05,
"loss": 0.4961,
"step": 430
},
{
"epoch": 0.12971698113207547,
"grad_norm": 0.889072835445404,
"learning_rate": 9.997506471819153e-05,
"loss": 0.5001,
"step": 440
},
{
"epoch": 0.13266509433962265,
"grad_norm": 0.6952047944068909,
"learning_rate": 9.997357649770069e-05,
"loss": 0.5061,
"step": 450
},
{
"epoch": 0.13561320754716982,
"grad_norm": 0.7447547316551208,
"learning_rate": 9.997204515580121e-05,
"loss": 0.5053,
"step": 460
},
{
"epoch": 0.13856132075471697,
"grad_norm": 0.6645297408103943,
"learning_rate": 9.997047069381442e-05,
"loss": 0.5096,
"step": 470
},
{
"epoch": 0.14150943396226415,
"grad_norm": 0.8288812041282654,
"learning_rate": 9.996885311309891e-05,
"loss": 0.511,
"step": 480
},
{
"epoch": 0.14445754716981132,
"grad_norm": 0.8417885899543762,
"learning_rate": 9.996719241505049e-05,
"loss": 0.5004,
"step": 490
},
{
"epoch": 0.1474056603773585,
"grad_norm": 0.7015907168388367,
"learning_rate": 9.99654886011021e-05,
"loss": 0.5149,
"step": 500
},
{
"epoch": 0.15035377358490565,
"grad_norm": 0.8258867859840393,
"learning_rate": 9.9963741672724e-05,
"loss": 0.5117,
"step": 510
},
{
"epoch": 0.15330188679245282,
"grad_norm": 0.7268001437187195,
"learning_rate": 9.996195163142352e-05,
"loss": 0.4765,
"step": 520
},
{
"epoch": 0.15625,
"grad_norm": 0.8334497213363647,
"learning_rate": 9.996011847874531e-05,
"loss": 0.5202,
"step": 530
},
{
"epoch": 0.15919811320754718,
"grad_norm": 0.6987500190734863,
"learning_rate": 9.995824221627115e-05,
"loss": 0.4853,
"step": 540
},
{
"epoch": 0.16214622641509435,
"grad_norm": 0.7377192378044128,
"learning_rate": 9.995632284562002e-05,
"loss": 0.4856,
"step": 550
},
{
"epoch": 0.1650943396226415,
"grad_norm": 0.7948574423789978,
"learning_rate": 9.995436036844813e-05,
"loss": 0.4965,
"step": 560
},
{
"epoch": 0.16804245283018868,
"grad_norm": 0.9272993803024292,
"learning_rate": 9.995235478644887e-05,
"loss": 0.4806,
"step": 570
},
{
"epoch": 0.17099056603773585,
"grad_norm": 0.8587119579315186,
"learning_rate": 9.995030610135283e-05,
"loss": 0.5239,
"step": 580
},
{
"epoch": 0.17393867924528303,
"grad_norm": 0.705521821975708,
"learning_rate": 9.994821431492778e-05,
"loss": 0.4786,
"step": 590
},
{
"epoch": 0.17688679245283018,
"grad_norm": 0.8783085346221924,
"learning_rate": 9.994607942897869e-05,
"loss": 0.4808,
"step": 600
},
{
"epoch": 0.17983490566037735,
"grad_norm": 0.8379423022270203,
"learning_rate": 9.994390144534773e-05,
"loss": 0.4695,
"step": 610
},
{
"epoch": 0.18278301886792453,
"grad_norm": 0.8350498080253601,
"learning_rate": 9.994168036591423e-05,
"loss": 0.4759,
"step": 620
},
{
"epoch": 0.1857311320754717,
"grad_norm": 0.6481857299804688,
"learning_rate": 9.993941619259473e-05,
"loss": 0.4812,
"step": 630
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.6771122217178345,
"learning_rate": 9.993710892734296e-05,
"loss": 0.4972,
"step": 640
},
{
"epoch": 0.19162735849056603,
"grad_norm": 0.893562376499176,
"learning_rate": 9.993475857214982e-05,
"loss": 0.4883,
"step": 650
},
{
"epoch": 0.1945754716981132,
"grad_norm": 0.878492534160614,
"learning_rate": 9.993236512904338e-05,
"loss": 0.4798,
"step": 660
},
{
"epoch": 0.19752358490566038,
"grad_norm": 1.180234432220459,
"learning_rate": 9.992992860008892e-05,
"loss": 0.4783,
"step": 670
},
{
"epoch": 0.20047169811320756,
"grad_norm": 0.8628876209259033,
"learning_rate": 9.992744898738889e-05,
"loss": 0.5061,
"step": 680
},
{
"epoch": 0.2034198113207547,
"grad_norm": 0.8073515295982361,
"learning_rate": 9.992492629308287e-05,
"loss": 0.4962,
"step": 690
},
{
"epoch": 0.20636792452830188,
"grad_norm": 0.7668582797050476,
"learning_rate": 9.992236051934769e-05,
"loss": 0.4712,
"step": 700
},
{
"epoch": 0.20931603773584906,
"grad_norm": 0.9387280941009521,
"learning_rate": 9.99197516683973e-05,
"loss": 0.4786,
"step": 710
},
{
"epoch": 0.21226415094339623,
"grad_norm": 0.827477753162384,
"learning_rate": 9.991709974248284e-05,
"loss": 0.4961,
"step": 720
},
{
"epoch": 0.21521226415094338,
"grad_norm": 0.7233847379684448,
"learning_rate": 9.991440474389262e-05,
"loss": 0.5125,
"step": 730
},
{
"epoch": 0.21816037735849056,
"grad_norm": 0.8469201326370239,
"learning_rate": 9.991166667495209e-05,
"loss": 0.4961,
"step": 740
},
{
"epoch": 0.22110849056603774,
"grad_norm": 1.3453787565231323,
"learning_rate": 9.990888553802391e-05,
"loss": 0.4784,
"step": 750
},
{
"epoch": 0.2240566037735849,
"grad_norm": 0.6355937123298645,
"learning_rate": 9.990606133550788e-05,
"loss": 0.4603,
"step": 760
},
{
"epoch": 0.2270047169811321,
"grad_norm": 0.8897044062614441,
"learning_rate": 9.990319406984095e-05,
"loss": 0.4903,
"step": 770
},
{
"epoch": 0.22995283018867924,
"grad_norm": 0.7813559770584106,
"learning_rate": 9.990028374349723e-05,
"loss": 0.4842,
"step": 780
},
{
"epoch": 0.2329009433962264,
"grad_norm": 1.0318368673324585,
"learning_rate": 9.989733035898802e-05,
"loss": 0.4781,
"step": 790
},
{
"epoch": 0.2358490566037736,
"grad_norm": 0.6712438464164734,
"learning_rate": 9.989433391886172e-05,
"loss": 0.4838,
"step": 800
},
{
"epoch": 0.23879716981132076,
"grad_norm": 0.7659137845039368,
"learning_rate": 9.989129442570393e-05,
"loss": 0.4795,
"step": 810
},
{
"epoch": 0.2417452830188679,
"grad_norm": 0.7460992336273193,
"learning_rate": 9.988821188213737e-05,
"loss": 0.4721,
"step": 820
},
{
"epoch": 0.2446933962264151,
"grad_norm": 0.8464518189430237,
"learning_rate": 9.988508629082191e-05,
"loss": 0.4763,
"step": 830
},
{
"epoch": 0.24764150943396226,
"grad_norm": 0.7318540215492249,
"learning_rate": 9.988191765445461e-05,
"loss": 0.4719,
"step": 840
},
{
"epoch": 0.2505896226415094,
"grad_norm": 0.7888909578323364,
"learning_rate": 9.98787059757696e-05,
"loss": 0.4709,
"step": 850
},
{
"epoch": 0.2535377358490566,
"grad_norm": 0.8116426467895508,
"learning_rate": 9.987545125753819e-05,
"loss": 0.4767,
"step": 860
},
{
"epoch": 0.25648584905660377,
"grad_norm": 0.8112141489982605,
"learning_rate": 9.987215350256885e-05,
"loss": 0.5031,
"step": 870
},
{
"epoch": 0.25943396226415094,
"grad_norm": 0.8783886432647705,
"learning_rate": 9.986881271370714e-05,
"loss": 0.4591,
"step": 880
},
{
"epoch": 0.2623820754716981,
"grad_norm": 0.9753439426422119,
"learning_rate": 9.986542889383576e-05,
"loss": 0.4678,
"step": 890
},
{
"epoch": 0.2653301886792453,
"grad_norm": 0.7954875826835632,
"learning_rate": 9.986200204587459e-05,
"loss": 0.4789,
"step": 900
},
{
"epoch": 0.26827830188679247,
"grad_norm": 0.8161525130271912,
"learning_rate": 9.985853217278058e-05,
"loss": 0.4609,
"step": 910
},
{
"epoch": 0.27122641509433965,
"grad_norm": 0.7179375886917114,
"learning_rate": 9.985501927754783e-05,
"loss": 0.4862,
"step": 920
},
{
"epoch": 0.27417452830188677,
"grad_norm": 0.8070648312568665,
"learning_rate": 9.985146336320759e-05,
"loss": 0.468,
"step": 930
},
{
"epoch": 0.27712264150943394,
"grad_norm": 0.8168680667877197,
"learning_rate": 9.984786443282816e-05,
"loss": 0.4517,
"step": 940
},
{
"epoch": 0.2800707547169811,
"grad_norm": 0.6777118444442749,
"learning_rate": 9.984422248951501e-05,
"loss": 0.4537,
"step": 950
},
{
"epoch": 0.2830188679245283,
"grad_norm": 0.6938192248344421,
"learning_rate": 9.984053753641073e-05,
"loss": 0.4668,
"step": 960
},
{
"epoch": 0.28596698113207547,
"grad_norm": 0.7651636004447937,
"learning_rate": 9.983680957669501e-05,
"loss": 0.4532,
"step": 970
},
{
"epoch": 0.28891509433962265,
"grad_norm": 0.6306117177009583,
"learning_rate": 9.983303861358461e-05,
"loss": 0.4644,
"step": 980
},
{
"epoch": 0.2918632075471698,
"grad_norm": 0.7826527953147888,
"learning_rate": 9.98292246503335e-05,
"loss": 0.4813,
"step": 990
},
{
"epoch": 0.294811320754717,
"grad_norm": 0.9264414310455322,
"learning_rate": 9.982536769023263e-05,
"loss": 0.4447,
"step": 1000
},
{
"epoch": 0.294811320754717,
"eval_runtime": 2254.5097,
"eval_samples_per_second": 4.013,
"eval_steps_per_second": 0.502,
"step": 1000
},
{
"epoch": 0.2977594339622642,
"grad_norm": 0.7220709919929504,
"learning_rate": 9.982146773661014e-05,
"loss": 0.4569,
"step": 1010
},
{
"epoch": 0.3007075471698113,
"grad_norm": 0.6796172261238098,
"learning_rate": 9.981752479283122e-05,
"loss": 0.4704,
"step": 1020
},
{
"epoch": 0.30365566037735847,
"grad_norm": 0.8216515779495239,
"learning_rate": 9.98135388622982e-05,
"loss": 0.4711,
"step": 1030
},
{
"epoch": 0.30660377358490565,
"grad_norm": 0.7677653431892395,
"learning_rate": 9.980950994845044e-05,
"loss": 0.4945,
"step": 1040
},
{
"epoch": 0.3095518867924528,
"grad_norm": 0.6322248578071594,
"learning_rate": 9.980543805476446e-05,
"loss": 0.4614,
"step": 1050
},
{
"epoch": 0.3125,
"grad_norm": 0.7706162333488464,
"learning_rate": 9.980132318475381e-05,
"loss": 0.4712,
"step": 1060
},
{
"epoch": 0.3154481132075472,
"grad_norm": 0.7900696992874146,
"learning_rate": 9.979716534196917e-05,
"loss": 0.4833,
"step": 1070
},
{
"epoch": 0.31839622641509435,
"grad_norm": 0.7712098956108093,
"learning_rate": 9.979296452999824e-05,
"loss": 0.4744,
"step": 1080
},
{
"epoch": 0.32134433962264153,
"grad_norm": 0.6703490018844604,
"learning_rate": 9.978872075246586e-05,
"loss": 0.4346,
"step": 1090
},
{
"epoch": 0.3242924528301887,
"grad_norm": 0.7551116943359375,
"learning_rate": 9.978443401303392e-05,
"loss": 0.4467,
"step": 1100
},
{
"epoch": 0.3272405660377358,
"grad_norm": 1.0921772718429565,
"learning_rate": 9.978010431540138e-05,
"loss": 0.4821,
"step": 1110
},
{
"epoch": 0.330188679245283,
"grad_norm": 0.98719322681427,
"learning_rate": 9.977573166330426e-05,
"loss": 0.4735,
"step": 1120
},
{
"epoch": 0.3331367924528302,
"grad_norm": 0.7647573947906494,
"learning_rate": 9.977131606051564e-05,
"loss": 0.4523,
"step": 1130
},
{
"epoch": 0.33608490566037735,
"grad_norm": 0.6722393035888672,
"learning_rate": 9.97668575108457e-05,
"loss": 0.4985,
"step": 1140
},
{
"epoch": 0.33903301886792453,
"grad_norm": 0.7893621921539307,
"learning_rate": 9.976235601814163e-05,
"loss": 0.4669,
"step": 1150
},
{
"epoch": 0.3419811320754717,
"grad_norm": 0.954258143901825,
"learning_rate": 9.975781158628772e-05,
"loss": 0.4902,
"step": 1160
},
{
"epoch": 0.3449292452830189,
"grad_norm": 0.7042810320854187,
"learning_rate": 9.975322421920527e-05,
"loss": 0.463,
"step": 1170
},
{
"epoch": 0.34787735849056606,
"grad_norm": 0.8965758681297302,
"learning_rate": 9.974859392085265e-05,
"loss": 0.4432,
"step": 1180
},
{
"epoch": 0.35082547169811323,
"grad_norm": 0.8003450036048889,
"learning_rate": 9.974392069522527e-05,
"loss": 0.4508,
"step": 1190
},
{
"epoch": 0.35377358490566035,
"grad_norm": 0.6887454986572266,
"learning_rate": 9.973920454635559e-05,
"loss": 0.476,
"step": 1200
},
{
"epoch": 0.35672169811320753,
"grad_norm": 0.7032167315483093,
"learning_rate": 9.97344454783131e-05,
"loss": 0.4664,
"step": 1210
},
{
"epoch": 0.3596698113207547,
"grad_norm": 0.8171786665916443,
"learning_rate": 9.97296434952043e-05,
"loss": 0.4617,
"step": 1220
},
{
"epoch": 0.3626179245283019,
"grad_norm": 1.0460634231567383,
"learning_rate": 9.972479860117279e-05,
"loss": 0.4556,
"step": 1230
},
{
"epoch": 0.36556603773584906,
"grad_norm": 0.8791403770446777,
"learning_rate": 9.97199108003991e-05,
"loss": 0.4339,
"step": 1240
},
{
"epoch": 0.36851415094339623,
"grad_norm": 0.7250672578811646,
"learning_rate": 9.971498009710088e-05,
"loss": 0.4784,
"step": 1250
},
{
"epoch": 0.3714622641509434,
"grad_norm": 0.767308235168457,
"learning_rate": 9.971000649553274e-05,
"loss": 0.4622,
"step": 1260
},
{
"epoch": 0.3744103773584906,
"grad_norm": 0.7120485901832581,
"learning_rate": 9.970498999998632e-05,
"loss": 0.4463,
"step": 1270
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.8479111194610596,
"learning_rate": 9.969993061479028e-05,
"loss": 0.4801,
"step": 1280
},
{
"epoch": 0.3803066037735849,
"grad_norm": 1.2430691719055176,
"learning_rate": 9.969482834431027e-05,
"loss": 0.4919,
"step": 1290
},
{
"epoch": 0.38325471698113206,
"grad_norm": 0.7112911939620972,
"learning_rate": 9.968968319294896e-05,
"loss": 0.486,
"step": 1300
},
{
"epoch": 0.38620283018867924,
"grad_norm": 0.822472870349884,
"learning_rate": 9.968449516514606e-05,
"loss": 0.4594,
"step": 1310
},
{
"epoch": 0.3891509433962264,
"grad_norm": 0.8960035443305969,
"learning_rate": 9.967926426537817e-05,
"loss": 0.4775,
"step": 1320
},
{
"epoch": 0.3920990566037736,
"grad_norm": 0.6983857750892639,
"learning_rate": 9.9673990498159e-05,
"loss": 0.4692,
"step": 1330
},
{
"epoch": 0.39504716981132076,
"grad_norm": 0.7807797193527222,
"learning_rate": 9.966867386803919e-05,
"loss": 0.4715,
"step": 1340
},
{
"epoch": 0.39799528301886794,
"grad_norm": 1.1104581356048584,
"learning_rate": 9.966331437960637e-05,
"loss": 0.4237,
"step": 1350
},
{
"epoch": 0.4009433962264151,
"grad_norm": 0.7249336242675781,
"learning_rate": 9.965791203748515e-05,
"loss": 0.4544,
"step": 1360
},
{
"epoch": 0.40389150943396224,
"grad_norm": 0.652172327041626,
"learning_rate": 9.965246684633716e-05,
"loss": 0.4747,
"step": 1370
},
{
"epoch": 0.4068396226415094,
"grad_norm": 0.6301229000091553,
"learning_rate": 9.964697881086091e-05,
"loss": 0.4321,
"step": 1380
},
{
"epoch": 0.4097877358490566,
"grad_norm": 0.7372440695762634,
"learning_rate": 9.9641447935792e-05,
"loss": 0.4606,
"step": 1390
},
{
"epoch": 0.41273584905660377,
"grad_norm": 0.5712461471557617,
"learning_rate": 9.963587422590291e-05,
"loss": 0.4439,
"step": 1400
},
{
"epoch": 0.41568396226415094,
"grad_norm": 0.7567596435546875,
"learning_rate": 9.963025768600309e-05,
"loss": 0.4467,
"step": 1410
},
{
"epoch": 0.4186320754716981,
"grad_norm": 0.621815025806427,
"learning_rate": 9.962459832093898e-05,
"loss": 0.4434,
"step": 1420
},
{
"epoch": 0.4215801886792453,
"grad_norm": 0.8316141963005066,
"learning_rate": 9.961889613559395e-05,
"loss": 0.4917,
"step": 1430
},
{
"epoch": 0.42452830188679247,
"grad_norm": 0.6534755229949951,
"learning_rate": 9.961315113488833e-05,
"loss": 0.4698,
"step": 1440
},
{
"epoch": 0.42747641509433965,
"grad_norm": 0.6665485501289368,
"learning_rate": 9.96073633237794e-05,
"loss": 0.4338,
"step": 1450
},
{
"epoch": 0.43042452830188677,
"grad_norm": 0.5755445957183838,
"learning_rate": 9.960153270726136e-05,
"loss": 0.4517,
"step": 1460
},
{
"epoch": 0.43337264150943394,
"grad_norm": 0.7612144351005554,
"learning_rate": 9.959565929036537e-05,
"loss": 0.4552,
"step": 1470
},
{
"epoch": 0.4363207547169811,
"grad_norm": 0.6562482714653015,
"learning_rate": 9.958974307815947e-05,
"loss": 0.454,
"step": 1480
},
{
"epoch": 0.4392688679245283,
"grad_norm": 0.7845900058746338,
"learning_rate": 9.95837840757487e-05,
"loss": 0.4443,
"step": 1490
},
{
"epoch": 0.44221698113207547,
"grad_norm": 0.5814388990402222,
"learning_rate": 9.9577782288275e-05,
"loss": 0.449,
"step": 1500
},
{
"epoch": 0.44516509433962265,
"grad_norm": 0.6388370990753174,
"learning_rate": 9.957173772091716e-05,
"loss": 0.4515,
"step": 1510
},
{
"epoch": 0.4481132075471698,
"grad_norm": 0.5995022058486938,
"learning_rate": 9.9565650378891e-05,
"loss": 0.4526,
"step": 1520
},
{
"epoch": 0.451061320754717,
"grad_norm": 0.648541271686554,
"learning_rate": 9.955952026744919e-05,
"loss": 0.4741,
"step": 1530
},
{
"epoch": 0.4540094339622642,
"grad_norm": 0.7231540679931641,
"learning_rate": 9.955334739188125e-05,
"loss": 0.4718,
"step": 1540
},
{
"epoch": 0.4569575471698113,
"grad_norm": 0.81379234790802,
"learning_rate": 9.954713175751373e-05,
"loss": 0.4996,
"step": 1550
},
{
"epoch": 0.45990566037735847,
"grad_norm": 0.5860093235969543,
"learning_rate": 9.954087336970994e-05,
"loss": 0.4643,
"step": 1560
},
{
"epoch": 0.46285377358490565,
"grad_norm": 0.736876904964447,
"learning_rate": 9.953457223387018e-05,
"loss": 0.4792,
"step": 1570
},
{
"epoch": 0.4658018867924528,
"grad_norm": 0.6783494353294373,
"learning_rate": 9.952822835543158e-05,
"loss": 0.4537,
"step": 1580
},
{
"epoch": 0.46875,
"grad_norm": 0.7174587845802307,
"learning_rate": 9.952184173986821e-05,
"loss": 0.4448,
"step": 1590
},
{
"epoch": 0.4716981132075472,
"grad_norm": 0.8119440674781799,
"learning_rate": 9.951541239269093e-05,
"loss": 0.4435,
"step": 1600
},
{
"epoch": 0.47464622641509435,
"grad_norm": 0.7257644534111023,
"learning_rate": 9.950894031944755e-05,
"loss": 0.4666,
"step": 1610
},
{
"epoch": 0.47759433962264153,
"grad_norm": 0.5853690505027771,
"learning_rate": 9.950242552572271e-05,
"loss": 0.4809,
"step": 1620
},
{
"epoch": 0.4805424528301887,
"grad_norm": 0.8147138953208923,
"learning_rate": 9.949586801713795e-05,
"loss": 0.436,
"step": 1630
},
{
"epoch": 0.4834905660377358,
"grad_norm": 0.649327278137207,
"learning_rate": 9.948926779935159e-05,
"loss": 0.4294,
"step": 1640
},
{
"epoch": 0.486438679245283,
"grad_norm": 0.6641764640808105,
"learning_rate": 9.948262487805889e-05,
"loss": 0.4576,
"step": 1650
},
{
"epoch": 0.4893867924528302,
"grad_norm": 0.6133862733840942,
"learning_rate": 9.947593925899192e-05,
"loss": 0.4435,
"step": 1660
},
{
"epoch": 0.49233490566037735,
"grad_norm": 0.6970853805541992,
"learning_rate": 9.946921094791958e-05,
"loss": 0.4381,
"step": 1670
},
{
"epoch": 0.49528301886792453,
"grad_norm": 0.8421550989151001,
"learning_rate": 9.946243995064764e-05,
"loss": 0.4406,
"step": 1680
},
{
"epoch": 0.4982311320754717,
"grad_norm": 1.3746379613876343,
"learning_rate": 9.945562627301865e-05,
"loss": 0.4654,
"step": 1690
},
{
"epoch": 0.5011792452830188,
"grad_norm": 0.743859589099884,
"learning_rate": 9.944876992091207e-05,
"loss": 0.4607,
"step": 1700
},
{
"epoch": 0.504127358490566,
"grad_norm": 0.5599111318588257,
"learning_rate": 9.944187090024413e-05,
"loss": 0.4719,
"step": 1710
},
{
"epoch": 0.5070754716981132,
"grad_norm": 0.6559354662895203,
"learning_rate": 9.943492921696787e-05,
"loss": 0.4629,
"step": 1720
},
{
"epoch": 0.5100235849056604,
"grad_norm": 0.6211720705032349,
"learning_rate": 9.942794487707314e-05,
"loss": 0.4571,
"step": 1730
},
{
"epoch": 0.5129716981132075,
"grad_norm": 0.735253095626831,
"learning_rate": 9.942091788658668e-05,
"loss": 0.4446,
"step": 1740
},
{
"epoch": 0.5159198113207547,
"grad_norm": 0.7313670516014099,
"learning_rate": 9.94138482515719e-05,
"loss": 0.46,
"step": 1750
},
{
"epoch": 0.5188679245283019,
"grad_norm": 0.7807812094688416,
"learning_rate": 9.940673597812911e-05,
"loss": 0.4518,
"step": 1760
},
{
"epoch": 0.5218160377358491,
"grad_norm": 0.6716724634170532,
"learning_rate": 9.939958107239537e-05,
"loss": 0.4329,
"step": 1770
},
{
"epoch": 0.5247641509433962,
"grad_norm": 0.5976256132125854,
"learning_rate": 9.939238354054454e-05,
"loss": 0.4479,
"step": 1780
},
{
"epoch": 0.5277122641509434,
"grad_norm": 0.7849621772766113,
"learning_rate": 9.938514338878726e-05,
"loss": 0.4482,
"step": 1790
},
{
"epoch": 0.5306603773584906,
"grad_norm": 0.7628201246261597,
"learning_rate": 9.937786062337094e-05,
"loss": 0.4585,
"step": 1800
},
{
"epoch": 0.5336084905660378,
"grad_norm": 0.6475509405136108,
"learning_rate": 9.937053525057977e-05,
"loss": 0.459,
"step": 1810
},
{
"epoch": 0.5365566037735849,
"grad_norm": 0.6419079899787903,
"learning_rate": 9.936316727673466e-05,
"loss": 0.4562,
"step": 1820
},
{
"epoch": 0.5395047169811321,
"grad_norm": 0.5959088206291199,
"learning_rate": 9.935575670819337e-05,
"loss": 0.4569,
"step": 1830
},
{
"epoch": 0.5424528301886793,
"grad_norm": 0.5781623125076294,
"learning_rate": 9.934830355135034e-05,
"loss": 0.4581,
"step": 1840
},
{
"epoch": 0.5454009433962265,
"grad_norm": 0.8665578365325928,
"learning_rate": 9.934080781263678e-05,
"loss": 0.4558,
"step": 1850
},
{
"epoch": 0.5483490566037735,
"grad_norm": 0.6500059366226196,
"learning_rate": 9.933326949852063e-05,
"loss": 0.4352,
"step": 1860
},
{
"epoch": 0.5512971698113207,
"grad_norm": 0.6708106994628906,
"learning_rate": 9.93256886155066e-05,
"loss": 0.4841,
"step": 1870
},
{
"epoch": 0.5542452830188679,
"grad_norm": 0.6584358215332031,
"learning_rate": 9.931806517013612e-05,
"loss": 0.4339,
"step": 1880
},
{
"epoch": 0.5571933962264151,
"grad_norm": 0.6990833878517151,
"learning_rate": 9.931039916898733e-05,
"loss": 0.4816,
"step": 1890
},
{
"epoch": 0.5601415094339622,
"grad_norm": 0.5763950943946838,
"learning_rate": 9.93026906186751e-05,
"loss": 0.4365,
"step": 1900
},
{
"epoch": 0.5630896226415094,
"grad_norm": 0.7058366537094116,
"learning_rate": 9.929493952585103e-05,
"loss": 0.4587,
"step": 1910
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.745461106300354,
"learning_rate": 9.928714589720338e-05,
"loss": 0.4636,
"step": 1920
},
{
"epoch": 0.5689858490566038,
"grad_norm": 0.6560072302818298,
"learning_rate": 9.927930973945718e-05,
"loss": 0.4366,
"step": 1930
},
{
"epoch": 0.5719339622641509,
"grad_norm": 0.617240309715271,
"learning_rate": 9.927143105937413e-05,
"loss": 0.4462,
"step": 1940
},
{
"epoch": 0.5748820754716981,
"grad_norm": 0.7028854489326477,
"learning_rate": 9.926350986375262e-05,
"loss": 0.4343,
"step": 1950
},
{
"epoch": 0.5778301886792453,
"grad_norm": 0.7185344696044922,
"learning_rate": 9.925554615942769e-05,
"loss": 0.4372,
"step": 1960
},
{
"epoch": 0.5807783018867925,
"grad_norm": 0.695959746837616,
"learning_rate": 9.924753995327112e-05,
"loss": 0.4639,
"step": 1970
},
{
"epoch": 0.5837264150943396,
"grad_norm": 0.6245220303535461,
"learning_rate": 9.923949125219133e-05,
"loss": 0.4421,
"step": 1980
},
{
"epoch": 0.5866745283018868,
"grad_norm": 0.8171836733818054,
"learning_rate": 9.923140006313343e-05,
"loss": 0.4443,
"step": 1990
},
{
"epoch": 0.589622641509434,
"grad_norm": 1.073494553565979,
"learning_rate": 9.922326639307917e-05,
"loss": 0.4704,
"step": 2000
},
{
"epoch": 0.589622641509434,
"eval_runtime": 2260.9909,
"eval_samples_per_second": 4.001,
"eval_steps_per_second": 0.5,
"step": 2000
},
{
"epoch": 0.5925707547169812,
"grad_norm": 0.6787593364715576,
"learning_rate": 9.921509024904696e-05,
"loss": 0.4415,
"step": 2010
},
{
"epoch": 0.5955188679245284,
"grad_norm": 0.7950299382209778,
"learning_rate": 9.920687163809188e-05,
"loss": 0.4794,
"step": 2020
},
{
"epoch": 0.5984669811320755,
"grad_norm": 0.5931691527366638,
"learning_rate": 9.919861056730564e-05,
"loss": 0.4497,
"step": 2030
},
{
"epoch": 0.6014150943396226,
"grad_norm": 1.0852645635604858,
"learning_rate": 9.919030704381656e-05,
"loss": 0.4614,
"step": 2040
},
{
"epoch": 0.6043632075471698,
"grad_norm": 0.6023767590522766,
"learning_rate": 9.918196107478966e-05,
"loss": 0.4595,
"step": 2050
},
{
"epoch": 0.6073113207547169,
"grad_norm": 0.5358297228813171,
"learning_rate": 9.917357266742651e-05,
"loss": 0.4205,
"step": 2060
},
{
"epoch": 0.6102594339622641,
"grad_norm": 0.6182430982589722,
"learning_rate": 9.916514182896534e-05,
"loss": 0.4506,
"step": 2070
},
{
"epoch": 0.6132075471698113,
"grad_norm": 0.6333853602409363,
"learning_rate": 9.9156668566681e-05,
"loss": 0.4323,
"step": 2080
},
{
"epoch": 0.6161556603773585,
"grad_norm": 0.5656896829605103,
"learning_rate": 9.914815288788492e-05,
"loss": 0.4408,
"step": 2090
},
{
"epoch": 0.6191037735849056,
"grad_norm": 0.7784629464149475,
"learning_rate": 9.913959479992516e-05,
"loss": 0.4392,
"step": 2100
},
{
"epoch": 0.6220518867924528,
"grad_norm": 0.6313180327415466,
"learning_rate": 9.913099431018636e-05,
"loss": 0.4493,
"step": 2110
},
{
"epoch": 0.625,
"grad_norm": 0.7768145799636841,
"learning_rate": 9.912235142608972e-05,
"loss": 0.4359,
"step": 2120
},
{
"epoch": 0.6279481132075472,
"grad_norm": 0.6603013873100281,
"learning_rate": 9.911366615509305e-05,
"loss": 0.4372,
"step": 2130
},
{
"epoch": 0.6308962264150944,
"grad_norm": 0.6187661290168762,
"learning_rate": 9.910493850469078e-05,
"loss": 0.4326,
"step": 2140
},
{
"epoch": 0.6338443396226415,
"grad_norm": 0.6031427979469299,
"learning_rate": 9.909616848241383e-05,
"loss": 0.4424,
"step": 2150
},
{
"epoch": 0.6367924528301887,
"grad_norm": 0.888024628162384,
"learning_rate": 9.908735609582968e-05,
"loss": 0.4479,
"step": 2160
},
{
"epoch": 0.6397405660377359,
"grad_norm": 0.5860321521759033,
"learning_rate": 9.907850135254246e-05,
"loss": 0.4408,
"step": 2170
},
{
"epoch": 0.6426886792452831,
"grad_norm": 0.63505619764328,
"learning_rate": 9.906960426019275e-05,
"loss": 0.4334,
"step": 2180
},
{
"epoch": 0.6456367924528302,
"grad_norm": 0.7878406047821045,
"learning_rate": 9.906066482645772e-05,
"loss": 0.476,
"step": 2190
},
{
"epoch": 0.6485849056603774,
"grad_norm": 0.7122024893760681,
"learning_rate": 9.905168305905108e-05,
"loss": 0.4364,
"step": 2200
},
{
"epoch": 0.6515330188679245,
"grad_norm": 0.7257914543151855,
"learning_rate": 9.904265896572303e-05,
"loss": 0.4444,
"step": 2210
},
{
"epoch": 0.6544811320754716,
"grad_norm": 0.5779203176498413,
"learning_rate": 9.903359255426034e-05,
"loss": 0.429,
"step": 2220
},
{
"epoch": 0.6574292452830188,
"grad_norm": 0.6193981170654297,
"learning_rate": 9.902448383248625e-05,
"loss": 0.439,
"step": 2230
},
{
"epoch": 0.660377358490566,
"grad_norm": 0.7393147349357605,
"learning_rate": 9.901533280826054e-05,
"loss": 0.4488,
"step": 2240
},
{
"epoch": 0.6633254716981132,
"grad_norm": 0.7435190677642822,
"learning_rate": 9.90061394894795e-05,
"loss": 0.4136,
"step": 2250
},
{
"epoch": 0.6662735849056604,
"grad_norm": 0.6783705353736877,
"learning_rate": 9.899690388407588e-05,
"loss": 0.4608,
"step": 2260
},
{
"epoch": 0.6692216981132075,
"grad_norm": 0.5903108716011047,
"learning_rate": 9.898762600001894e-05,
"loss": 0.4476,
"step": 2270
},
{
"epoch": 0.6721698113207547,
"grad_norm": 0.8416420817375183,
"learning_rate": 9.897830584531442e-05,
"loss": 0.4359,
"step": 2280
},
{
"epoch": 0.6751179245283019,
"grad_norm": 0.636867880821228,
"learning_rate": 9.896894342800456e-05,
"loss": 0.4507,
"step": 2290
},
{
"epoch": 0.6780660377358491,
"grad_norm": 0.6105553507804871,
"learning_rate": 9.8959538756168e-05,
"loss": 0.4383,
"step": 2300
},
{
"epoch": 0.6810141509433962,
"grad_norm": 0.8191878199577332,
"learning_rate": 9.895009183791991e-05,
"loss": 0.4299,
"step": 2310
},
{
"epoch": 0.6839622641509434,
"grad_norm": 0.5651885271072388,
"learning_rate": 9.894060268141188e-05,
"loss": 0.4312,
"step": 2320
},
{
"epoch": 0.6869103773584906,
"grad_norm": 0.6905661821365356,
"learning_rate": 9.893107129483195e-05,
"loss": 0.4436,
"step": 2330
},
{
"epoch": 0.6898584905660378,
"grad_norm": 0.7586243748664856,
"learning_rate": 9.89214976864046e-05,
"loss": 0.4577,
"step": 2340
},
{
"epoch": 0.6928066037735849,
"grad_norm": 0.8332115411758423,
"learning_rate": 9.891188186439076e-05,
"loss": 0.453,
"step": 2350
},
{
"epoch": 0.6957547169811321,
"grad_norm": 0.6803733706474304,
"learning_rate": 9.890222383708776e-05,
"loss": 0.4247,
"step": 2360
},
{
"epoch": 0.6987028301886793,
"grad_norm": 0.7532495856285095,
"learning_rate": 9.889252361282935e-05,
"loss": 0.426,
"step": 2370
},
{
"epoch": 0.7016509433962265,
"grad_norm": 0.7318875193595886,
"learning_rate": 9.888278119998573e-05,
"loss": 0.4641,
"step": 2380
},
{
"epoch": 0.7045990566037735,
"grad_norm": 0.6257642507553101,
"learning_rate": 9.887299660696343e-05,
"loss": 0.4245,
"step": 2390
},
{
"epoch": 0.7075471698113207,
"grad_norm": 0.6812471151351929,
"learning_rate": 9.886316984220546e-05,
"loss": 0.4186,
"step": 2400
},
{
"epoch": 0.7104952830188679,
"grad_norm": 0.6753676533699036,
"learning_rate": 9.885330091419116e-05,
"loss": 0.4505,
"step": 2410
},
{
"epoch": 0.7134433962264151,
"grad_norm": 0.7901769876480103,
"learning_rate": 9.884338983143627e-05,
"loss": 0.4346,
"step": 2420
},
{
"epoch": 0.7163915094339622,
"grad_norm": 0.6888976693153381,
"learning_rate": 9.883343660249291e-05,
"loss": 0.4452,
"step": 2430
},
{
"epoch": 0.7193396226415094,
"grad_norm": 0.7051608562469482,
"learning_rate": 9.882344123594958e-05,
"loss": 0.4363,
"step": 2440
},
{
"epoch": 0.7222877358490566,
"grad_norm": 0.682527482509613,
"learning_rate": 9.88134037404311e-05,
"loss": 0.4231,
"step": 2450
},
{
"epoch": 0.7252358490566038,
"grad_norm": 0.6059719920158386,
"learning_rate": 9.880332412459868e-05,
"loss": 0.4113,
"step": 2460
},
{
"epoch": 0.7281839622641509,
"grad_norm": 0.6270197629928589,
"learning_rate": 9.879320239714986e-05,
"loss": 0.4291,
"step": 2470
},
{
"epoch": 0.7311320754716981,
"grad_norm": 0.7043851613998413,
"learning_rate": 9.878303856681851e-05,
"loss": 0.4624,
"step": 2480
},
{
"epoch": 0.7340801886792453,
"grad_norm": 0.5212409496307373,
"learning_rate": 9.877283264237484e-05,
"loss": 0.4242,
"step": 2490
},
{
"epoch": 0.7370283018867925,
"grad_norm": 0.7234752774238586,
"learning_rate": 9.876258463262539e-05,
"loss": 0.4314,
"step": 2500
},
{
"epoch": 0.7399764150943396,
"grad_norm": 0.5897389054298401,
"learning_rate": 9.875229454641301e-05,
"loss": 0.5093,
"step": 2510
},
{
"epoch": 0.7429245283018868,
"grad_norm": 0.7722876667976379,
"learning_rate": 9.874196239261683e-05,
"loss": 0.444,
"step": 2520
},
{
"epoch": 0.745872641509434,
"grad_norm": 0.7034250497817993,
"learning_rate": 9.873158818015233e-05,
"loss": 0.4883,
"step": 2530
},
{
"epoch": 0.7488207547169812,
"grad_norm": 0.6282274127006531,
"learning_rate": 9.872117191797122e-05,
"loss": 0.4388,
"step": 2540
},
{
"epoch": 0.7517688679245284,
"grad_norm": 0.6394955515861511,
"learning_rate": 9.871071361506156e-05,
"loss": 0.4781,
"step": 2550
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.6548972725868225,
"learning_rate": 9.870021328044762e-05,
"loss": 0.4355,
"step": 2560
},
{
"epoch": 0.7576650943396226,
"grad_norm": 0.8458198308944702,
"learning_rate": 9.868967092319003e-05,
"loss": 0.4704,
"step": 2570
},
{
"epoch": 0.7606132075471698,
"grad_norm": 0.6317086219787598,
"learning_rate": 9.867908655238556e-05,
"loss": 0.434,
"step": 2580
},
{
"epoch": 0.7635613207547169,
"grad_norm": 0.6734662652015686,
"learning_rate": 9.866846017716734e-05,
"loss": 0.4654,
"step": 2590
},
{
"epoch": 0.7665094339622641,
"grad_norm": 0.6610127687454224,
"learning_rate": 9.865779180670466e-05,
"loss": 0.4887,
"step": 2600
},
{
"epoch": 0.7694575471698113,
"grad_norm": 0.5381469130516052,
"learning_rate": 9.864708145020314e-05,
"loss": 0.4296,
"step": 2610
},
{
"epoch": 0.7724056603773585,
"grad_norm": 0.6565837860107422,
"learning_rate": 9.863632911690453e-05,
"loss": 0.4527,
"step": 2620
},
{
"epoch": 0.7753537735849056,
"grad_norm": 0.6266703605651855,
"learning_rate": 9.862553481608687e-05,
"loss": 0.4501,
"step": 2630
},
{
"epoch": 0.7783018867924528,
"grad_norm": 0.6772686243057251,
"learning_rate": 9.86146985570644e-05,
"loss": 0.4519,
"step": 2640
},
{
"epoch": 0.78125,
"grad_norm": 0.5664348602294922,
"learning_rate": 9.860382034918754e-05,
"loss": 0.4498,
"step": 2650
},
{
"epoch": 0.7841981132075472,
"grad_norm": 0.693162202835083,
"learning_rate": 9.859290020184293e-05,
"loss": 0.4448,
"step": 2660
},
{
"epoch": 0.7871462264150944,
"grad_norm": 0.6841190457344055,
"learning_rate": 9.858193812445337e-05,
"loss": 0.4034,
"step": 2670
},
{
"epoch": 0.7900943396226415,
"grad_norm": 0.8030083775520325,
"learning_rate": 9.857093412647791e-05,
"loss": 0.4628,
"step": 2680
},
{
"epoch": 0.7930424528301887,
"grad_norm": 0.5943866968154907,
"learning_rate": 9.855988821741169e-05,
"loss": 0.443,
"step": 2690
},
{
"epoch": 0.7959905660377359,
"grad_norm": 0.670657217502594,
"learning_rate": 9.854880040678606e-05,
"loss": 0.4218,
"step": 2700
},
{
"epoch": 0.7989386792452831,
"grad_norm": 0.5988708734512329,
"learning_rate": 9.853767070416852e-05,
"loss": 0.4394,
"step": 2710
},
{
"epoch": 0.8018867924528302,
"grad_norm": 0.7311452627182007,
"learning_rate": 9.852649911916272e-05,
"loss": 0.4396,
"step": 2720
},
{
"epoch": 0.8048349056603774,
"grad_norm": 0.747868537902832,
"learning_rate": 9.851528566140844e-05,
"loss": 0.4119,
"step": 2730
},
{
"epoch": 0.8077830188679245,
"grad_norm": 0.7394080758094788,
"learning_rate": 9.850403034058157e-05,
"loss": 0.4728,
"step": 2740
},
{
"epoch": 0.8107311320754716,
"grad_norm": 0.6365045309066772,
"learning_rate": 9.849273316639418e-05,
"loss": 0.4271,
"step": 2750
},
{
"epoch": 0.8136792452830188,
"grad_norm": 0.7559402585029602,
"learning_rate": 9.848139414859441e-05,
"loss": 0.4273,
"step": 2760
},
{
"epoch": 0.816627358490566,
"grad_norm": 0.7403167486190796,
"learning_rate": 9.847001329696653e-05,
"loss": 0.4466,
"step": 2770
},
{
"epoch": 0.8195754716981132,
"grad_norm": 0.6549522280693054,
"learning_rate": 9.845859062133087e-05,
"loss": 0.4251,
"step": 2780
},
{
"epoch": 0.8225235849056604,
"grad_norm": 0.7104706764221191,
"learning_rate": 9.84471261315439e-05,
"loss": 0.461,
"step": 2790
},
{
"epoch": 0.8254716981132075,
"grad_norm": 0.6579510569572449,
"learning_rate": 9.843561983749816e-05,
"loss": 0.4425,
"step": 2800
},
{
"epoch": 0.8284198113207547,
"grad_norm": 0.6831321716308594,
"learning_rate": 9.84240717491222e-05,
"loss": 0.4293,
"step": 2810
},
{
"epoch": 0.8313679245283019,
"grad_norm": 0.7022287249565125,
"learning_rate": 9.841248187638074e-05,
"loss": 0.4424,
"step": 2820
},
{
"epoch": 0.8343160377358491,
"grad_norm": 0.5860363245010376,
"learning_rate": 9.840085022927446e-05,
"loss": 0.4304,
"step": 2830
},
{
"epoch": 0.8372641509433962,
"grad_norm": 0.6918849945068359,
"learning_rate": 9.838917681784012e-05,
"loss": 0.4194,
"step": 2840
},
{
"epoch": 0.8402122641509434,
"grad_norm": 0.648749828338623,
"learning_rate": 9.837746165215056e-05,
"loss": 0.4354,
"step": 2850
},
{
"epoch": 0.8431603773584906,
"grad_norm": 0.7720881700515747,
"learning_rate": 9.836570474231458e-05,
"loss": 0.4145,
"step": 2860
},
{
"epoch": 0.8461084905660378,
"grad_norm": 0.530616044998169,
"learning_rate": 9.835390609847704e-05,
"loss": 0.4461,
"step": 2870
},
{
"epoch": 0.8490566037735849,
"grad_norm": 0.47622278332710266,
"learning_rate": 9.83420657308188e-05,
"loss": 0.4662,
"step": 2880
},
{
"epoch": 0.8520047169811321,
"grad_norm": 0.631523072719574,
"learning_rate": 9.833018364955673e-05,
"loss": 0.4393,
"step": 2890
},
{
"epoch": 0.8549528301886793,
"grad_norm": 0.5349221229553223,
"learning_rate": 9.83182598649437e-05,
"loss": 0.4252,
"step": 2900
},
{
"epoch": 0.8579009433962265,
"grad_norm": 0.7066219449043274,
"learning_rate": 9.830629438726853e-05,
"loss": 0.4414,
"step": 2910
},
{
"epoch": 0.8608490566037735,
"grad_norm": 0.6091190576553345,
"learning_rate": 9.829428722685605e-05,
"loss": 0.4456,
"step": 2920
},
{
"epoch": 0.8637971698113207,
"grad_norm": 0.586168646812439,
"learning_rate": 9.828223839406707e-05,
"loss": 0.4089,
"step": 2930
},
{
"epoch": 0.8667452830188679,
"grad_norm": 0.5532593727111816,
"learning_rate": 9.827014789929831e-05,
"loss": 0.4585,
"step": 2940
},
{
"epoch": 0.8696933962264151,
"grad_norm": 0.599045991897583,
"learning_rate": 9.825801575298248e-05,
"loss": 0.4369,
"step": 2950
},
{
"epoch": 0.8726415094339622,
"grad_norm": 0.6460824608802795,
"learning_rate": 9.824584196558821e-05,
"loss": 0.4554,
"step": 2960
},
{
"epoch": 0.8755896226415094,
"grad_norm": 0.6909619569778442,
"learning_rate": 9.82336265476201e-05,
"loss": 0.4187,
"step": 2970
},
{
"epoch": 0.8785377358490566,
"grad_norm": 0.6244730949401855,
"learning_rate": 9.822136950961859e-05,
"loss": 0.4402,
"step": 2980
},
{
"epoch": 0.8814858490566038,
"grad_norm": 0.6286002397537231,
"learning_rate": 9.820907086216011e-05,
"loss": 0.4343,
"step": 2990
},
{
"epoch": 0.8844339622641509,
"grad_norm": 0.6675744652748108,
"learning_rate": 9.819673061585698e-05,
"loss": 0.4322,
"step": 3000
},
{
"epoch": 0.8844339622641509,
"eval_runtime": 2152.2698,
"eval_samples_per_second": 4.203,
"eval_steps_per_second": 0.525,
"step": 3000
},
{
"epoch": 0.8873820754716981,
"grad_norm": 0.7260634899139404,
"learning_rate": 9.818434878135739e-05,
"loss": 0.4637,
"step": 3010
},
{
"epoch": 0.8903301886792453,
"grad_norm": 0.7156859636306763,
"learning_rate": 9.81719253693454e-05,
"loss": 0.4392,
"step": 3020
},
{
"epoch": 0.8932783018867925,
"grad_norm": 0.6953458786010742,
"learning_rate": 9.815946039054105e-05,
"loss": 0.4102,
"step": 3030
},
{
"epoch": 0.8962264150943396,
"grad_norm": 0.6781195998191833,
"learning_rate": 9.814695385570009e-05,
"loss": 0.4643,
"step": 3040
},
{
"epoch": 0.8991745283018868,
"grad_norm": 0.772041380405426,
"learning_rate": 9.813440577561427e-05,
"loss": 0.4446,
"step": 3050
},
{
"epoch": 0.902122641509434,
"grad_norm": 0.7084837555885315,
"learning_rate": 9.812181616111111e-05,
"loss": 0.4524,
"step": 3060
},
{
"epoch": 0.9050707547169812,
"grad_norm": 0.5128844976425171,
"learning_rate": 9.810918502305399e-05,
"loss": 0.4728,
"step": 3070
},
{
"epoch": 0.9080188679245284,
"grad_norm": 0.7452048659324646,
"learning_rate": 9.809651237234211e-05,
"loss": 0.4517,
"step": 3080
},
{
"epoch": 0.9109669811320755,
"grad_norm": 0.746753454208374,
"learning_rate": 9.80837982199105e-05,
"loss": 0.4509,
"step": 3090
},
{
"epoch": 0.9139150943396226,
"grad_norm": 0.6032639741897583,
"learning_rate": 9.807104257673003e-05,
"loss": 0.4406,
"step": 3100
},
{
"epoch": 0.9168632075471698,
"grad_norm": 1.1121796369552612,
"learning_rate": 9.80582454538073e-05,
"loss": 0.447,
"step": 3110
},
{
"epoch": 0.9198113207547169,
"grad_norm": 0.4976518750190735,
"learning_rate": 9.804540686218477e-05,
"loss": 0.4178,
"step": 3120
},
{
"epoch": 0.9227594339622641,
"grad_norm": 0.6110193729400635,
"learning_rate": 9.803252681294067e-05,
"loss": 0.4517,
"step": 3130
},
{
"epoch": 0.9257075471698113,
"grad_norm": 0.674140453338623,
"learning_rate": 9.801960531718896e-05,
"loss": 0.4464,
"step": 3140
},
{
"epoch": 0.9286556603773585,
"grad_norm": 0.6629025936126709,
"learning_rate": 9.800664238607941e-05,
"loss": 0.4309,
"step": 3150
},
{
"epoch": 0.9316037735849056,
"grad_norm": 0.7753364443778992,
"learning_rate": 9.799363803079754e-05,
"loss": 0.434,
"step": 3160
},
{
"epoch": 0.9345518867924528,
"grad_norm": 0.8927829265594482,
"learning_rate": 9.798059226256459e-05,
"loss": 0.4532,
"step": 3170
},
{
"epoch": 0.9375,
"grad_norm": 0.7625536918640137,
"learning_rate": 9.796750509263752e-05,
"loss": 0.4583,
"step": 3180
},
{
"epoch": 0.9404481132075472,
"grad_norm": 0.6334768533706665,
"learning_rate": 9.79543765323091e-05,
"loss": 0.4421,
"step": 3190
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.663287341594696,
"learning_rate": 9.794120659290771e-05,
"loss": 0.4379,
"step": 3200
},
{
"epoch": 0.9463443396226415,
"grad_norm": 0.7161186933517456,
"learning_rate": 9.79279952857975e-05,
"loss": 0.4587,
"step": 3210
},
{
"epoch": 0.9492924528301887,
"grad_norm": 0.722528874874115,
"learning_rate": 9.79147426223783e-05,
"loss": 0.4433,
"step": 3220
},
{
"epoch": 0.9522405660377359,
"grad_norm": 1.149199366569519,
"learning_rate": 9.790144861408561e-05,
"loss": 0.4356,
"step": 3230
},
{
"epoch": 0.9551886792452831,
"grad_norm": 0.6969400644302368,
"learning_rate": 9.788811327239064e-05,
"loss": 0.4327,
"step": 3240
},
{
"epoch": 0.9581367924528302,
"grad_norm": 0.6814727783203125,
"learning_rate": 9.787473660880022e-05,
"loss": 0.4121,
"step": 3250
},
{
"epoch": 0.9610849056603774,
"grad_norm": 0.7098613381385803,
"learning_rate": 9.786131863485689e-05,
"loss": 0.4269,
"step": 3260
},
{
"epoch": 0.9640330188679245,
"grad_norm": 0.6246148347854614,
"learning_rate": 9.78478593621388e-05,
"loss": 0.414,
"step": 3270
},
{
"epoch": 0.9669811320754716,
"grad_norm": 0.7819629907608032,
"learning_rate": 9.783435880225971e-05,
"loss": 0.4012,
"step": 3280
},
{
"epoch": 0.9699292452830188,
"grad_norm": 0.6134668588638306,
"learning_rate": 9.782081696686908e-05,
"loss": 0.4583,
"step": 3290
},
{
"epoch": 0.972877358490566,
"grad_norm": 0.5895134806632996,
"learning_rate": 9.780723386765194e-05,
"loss": 0.446,
"step": 3300
},
{
"epoch": 0.9758254716981132,
"grad_norm": 0.644167423248291,
"learning_rate": 9.779360951632892e-05,
"loss": 0.4106,
"step": 3310
},
{
"epoch": 0.9787735849056604,
"grad_norm": 0.47903046011924744,
"learning_rate": 9.777994392465625e-05,
"loss": 0.4281,
"step": 3320
},
{
"epoch": 0.9817216981132075,
"grad_norm": 0.6636764407157898,
"learning_rate": 9.776623710442579e-05,
"loss": 0.4521,
"step": 3330
},
{
"epoch": 0.9846698113207547,
"grad_norm": 0.6923460960388184,
"learning_rate": 9.775248906746488e-05,
"loss": 0.4308,
"step": 3340
},
{
"epoch": 0.9876179245283019,
"grad_norm": 0.548141598701477,
"learning_rate": 9.773869982563652e-05,
"loss": 0.4072,
"step": 3350
},
{
"epoch": 0.9905660377358491,
"grad_norm": 0.5914281010627747,
"learning_rate": 9.772486939083924e-05,
"loss": 0.41,
"step": 3360
},
{
"epoch": 0.9935141509433962,
"grad_norm": 0.5691133141517639,
"learning_rate": 9.771099777500709e-05,
"loss": 0.4379,
"step": 3370
},
{
"epoch": 0.9964622641509434,
"grad_norm": 0.5721046924591064,
"learning_rate": 9.769708499010966e-05,
"loss": 0.4381,
"step": 3380
},
{
"epoch": 0.9994103773584906,
"grad_norm": 0.8090994358062744,
"learning_rate": 9.768313104815207e-05,
"loss": 0.4141,
"step": 3390
},
{
"epoch": 1.0023584905660377,
"grad_norm": 0.6600131392478943,
"learning_rate": 9.766913596117498e-05,
"loss": 0.4385,
"step": 3400
},
{
"epoch": 1.005306603773585,
"grad_norm": 0.5856651067733765,
"learning_rate": 9.765509974125448e-05,
"loss": 0.4249,
"step": 3410
},
{
"epoch": 1.008254716981132,
"grad_norm": 0.8616664409637451,
"learning_rate": 9.764102240050225e-05,
"loss": 0.4108,
"step": 3420
},
{
"epoch": 1.0112028301886793,
"grad_norm": 0.6368763446807861,
"learning_rate": 9.762690395106541e-05,
"loss": 0.433,
"step": 3430
},
{
"epoch": 1.0141509433962264,
"grad_norm": 0.7372316718101501,
"learning_rate": 9.761274440512652e-05,
"loss": 0.4457,
"step": 3440
},
{
"epoch": 1.0170990566037736,
"grad_norm": 0.5737775564193726,
"learning_rate": 9.75985437749036e-05,
"loss": 0.419,
"step": 3450
},
{
"epoch": 1.0200471698113207,
"grad_norm": 0.49331873655319214,
"learning_rate": 9.758430207265021e-05,
"loss": 0.4443,
"step": 3460
},
{
"epoch": 1.022995283018868,
"grad_norm": 0.612887442111969,
"learning_rate": 9.757001931065526e-05,
"loss": 0.4523,
"step": 3470
},
{
"epoch": 1.025943396226415,
"grad_norm": 0.6426276564598083,
"learning_rate": 9.755569550124313e-05,
"loss": 0.4368,
"step": 3480
},
{
"epoch": 1.0288915094339623,
"grad_norm": 0.6959813833236694,
"learning_rate": 9.75413306567736e-05,
"loss": 0.4194,
"step": 3490
},
{
"epoch": 1.0318396226415094,
"grad_norm": 0.6095828413963318,
"learning_rate": 9.752692478964187e-05,
"loss": 0.4253,
"step": 3500
},
{
"epoch": 1.0347877358490567,
"grad_norm": 0.810645580291748,
"learning_rate": 9.751247791227852e-05,
"loss": 0.4317,
"step": 3510
},
{
"epoch": 1.0377358490566038,
"grad_norm": 0.60860675573349,
"learning_rate": 9.749799003714954e-05,
"loss": 0.43,
"step": 3520
},
{
"epoch": 1.040683962264151,
"grad_norm": 0.52146977186203,
"learning_rate": 9.74834611767563e-05,
"loss": 0.4174,
"step": 3530
},
{
"epoch": 1.0436320754716981,
"grad_norm": 0.534828245639801,
"learning_rate": 9.746889134363552e-05,
"loss": 0.4369,
"step": 3540
},
{
"epoch": 1.0465801886792452,
"grad_norm": 0.6910519599914551,
"learning_rate": 9.745428055035928e-05,
"loss": 0.4277,
"step": 3550
},
{
"epoch": 1.0495283018867925,
"grad_norm": 0.5335171222686768,
"learning_rate": 9.7439628809535e-05,
"loss": 0.4257,
"step": 3560
},
{
"epoch": 1.0524764150943395,
"grad_norm": 0.6715592741966248,
"learning_rate": 9.742493613380544e-05,
"loss": 0.4616,
"step": 3570
},
{
"epoch": 1.0554245283018868,
"grad_norm": 0.6723374128341675,
"learning_rate": 9.741020253584865e-05,
"loss": 0.4155,
"step": 3580
},
{
"epoch": 1.0583726415094339,
"grad_norm": 0.6973185539245605,
"learning_rate": 9.739542802837804e-05,
"loss": 0.4518,
"step": 3590
},
{
"epoch": 1.0613207547169812,
"grad_norm": 0.5881571769714355,
"learning_rate": 9.738061262414231e-05,
"loss": 0.4224,
"step": 3600
},
{
"epoch": 1.0642688679245282,
"grad_norm": 0.5016629099845886,
"learning_rate": 9.736575633592542e-05,
"loss": 0.4059,
"step": 3610
},
{
"epoch": 1.0672169811320755,
"grad_norm": 0.6979458332061768,
"learning_rate": 9.735085917654662e-05,
"loss": 0.4281,
"step": 3620
},
{
"epoch": 1.0701650943396226,
"grad_norm": 0.624667227268219,
"learning_rate": 9.733592115886047e-05,
"loss": 0.4257,
"step": 3630
},
{
"epoch": 1.0731132075471699,
"grad_norm": 0.5829150676727295,
"learning_rate": 9.73209422957567e-05,
"loss": 0.4454,
"step": 3640
},
{
"epoch": 1.076061320754717,
"grad_norm": 0.5612331032752991,
"learning_rate": 9.730592260016038e-05,
"loss": 0.4344,
"step": 3650
},
{
"epoch": 1.0790094339622642,
"grad_norm": 0.7450082302093506,
"learning_rate": 9.729086208503174e-05,
"loss": 0.44,
"step": 3660
},
{
"epoch": 1.0819575471698113,
"grad_norm": 0.6346138715744019,
"learning_rate": 9.727576076336626e-05,
"loss": 0.4358,
"step": 3670
},
{
"epoch": 1.0849056603773586,
"grad_norm": 0.8000290989875793,
"learning_rate": 9.726061864819464e-05,
"loss": 0.4166,
"step": 3680
},
{
"epoch": 1.0878537735849056,
"grad_norm": 0.6139888167381287,
"learning_rate": 9.724543575258277e-05,
"loss": 0.4393,
"step": 3690
},
{
"epoch": 1.0908018867924527,
"grad_norm": 0.6680110692977905,
"learning_rate": 9.723021208963175e-05,
"loss": 0.4368,
"step": 3700
},
{
"epoch": 1.09375,
"grad_norm": 0.5566927790641785,
"learning_rate": 9.721494767247779e-05,
"loss": 0.4563,
"step": 3710
},
{
"epoch": 1.0966981132075473,
"grad_norm": 0.6259219646453857,
"learning_rate": 9.719964251429236e-05,
"loss": 0.4645,
"step": 3720
},
{
"epoch": 1.0996462264150944,
"grad_norm": 1.1889296770095825,
"learning_rate": 9.7184296628282e-05,
"loss": 0.4477,
"step": 3730
},
{
"epoch": 1.1025943396226414,
"grad_norm": 0.6179032325744629,
"learning_rate": 9.716891002768848e-05,
"loss": 0.4299,
"step": 3740
},
{
"epoch": 1.1055424528301887,
"grad_norm": 0.6501899361610413,
"learning_rate": 9.715348272578862e-05,
"loss": 0.4167,
"step": 3750
},
{
"epoch": 1.1084905660377358,
"grad_norm": 0.5426408052444458,
"learning_rate": 9.71380147358944e-05,
"loss": 0.45,
"step": 3760
},
{
"epoch": 1.111438679245283,
"grad_norm": 0.8587615489959717,
"learning_rate": 9.71225060713529e-05,
"loss": 0.4085,
"step": 3770
},
{
"epoch": 1.1143867924528301,
"grad_norm": 0.5461919903755188,
"learning_rate": 9.71069567455463e-05,
"loss": 0.4297,
"step": 3780
},
{
"epoch": 1.1173349056603774,
"grad_norm": 0.665952205657959,
"learning_rate": 9.70913667718919e-05,
"loss": 0.4132,
"step": 3790
},
{
"epoch": 1.1202830188679245,
"grad_norm": 0.6407999396324158,
"learning_rate": 9.7075736163842e-05,
"loss": 0.4034,
"step": 3800
},
{
"epoch": 1.1232311320754718,
"grad_norm": 0.691685140132904,
"learning_rate": 9.706006493488402e-05,
"loss": 0.4344,
"step": 3810
},
{
"epoch": 1.1261792452830188,
"grad_norm": 0.6893350481987,
"learning_rate": 9.704435309854043e-05,
"loss": 0.4302,
"step": 3820
},
{
"epoch": 1.1291273584905661,
"grad_norm": 0.5995667576789856,
"learning_rate": 9.70286006683687e-05,
"loss": 0.4318,
"step": 3830
},
{
"epoch": 1.1320754716981132,
"grad_norm": 0.6128852367401123,
"learning_rate": 9.701280765796137e-05,
"loss": 0.4199,
"step": 3840
},
{
"epoch": 1.1350235849056605,
"grad_norm": 0.6381848454475403,
"learning_rate": 9.699697408094597e-05,
"loss": 0.4228,
"step": 3850
},
{
"epoch": 1.1379716981132075,
"grad_norm": 0.6368187665939331,
"learning_rate": 9.698109995098505e-05,
"loss": 0.4354,
"step": 3860
},
{
"epoch": 1.1409198113207548,
"grad_norm": 0.7368292808532715,
"learning_rate": 9.696518528177613e-05,
"loss": 0.4198,
"step": 3870
},
{
"epoch": 1.1438679245283019,
"grad_norm": 0.8033089637756348,
"learning_rate": 9.694923008705177e-05,
"loss": 0.425,
"step": 3880
},
{
"epoch": 1.146816037735849,
"grad_norm": 0.6468704342842102,
"learning_rate": 9.69332343805794e-05,
"loss": 0.4699,
"step": 3890
},
{
"epoch": 1.1497641509433962,
"grad_norm": 0.593371570110321,
"learning_rate": 9.691719817616147e-05,
"loss": 0.4161,
"step": 3900
},
{
"epoch": 1.1527122641509433,
"grad_norm": 0.607347309589386,
"learning_rate": 9.690112148763542e-05,
"loss": 0.4377,
"step": 3910
},
{
"epoch": 1.1556603773584906,
"grad_norm": 0.7377333641052246,
"learning_rate": 9.688500432887351e-05,
"loss": 0.4286,
"step": 3920
},
{
"epoch": 1.1586084905660377,
"grad_norm": 0.7115645408630371,
"learning_rate": 9.6868846713783e-05,
"loss": 0.4135,
"step": 3930
},
{
"epoch": 1.161556603773585,
"grad_norm": 0.6309870481491089,
"learning_rate": 9.685264865630605e-05,
"loss": 0.44,
"step": 3940
},
{
"epoch": 1.164504716981132,
"grad_norm": 0.6622278094291687,
"learning_rate": 9.683641017041972e-05,
"loss": 0.4221,
"step": 3950
},
{
"epoch": 1.1674528301886793,
"grad_norm": 0.6244953274726868,
"learning_rate": 9.68201312701359e-05,
"loss": 0.4204,
"step": 3960
},
{
"epoch": 1.1704009433962264,
"grad_norm": 0.5539456009864807,
"learning_rate": 9.680381196950143e-05,
"loss": 0.427,
"step": 3970
},
{
"epoch": 1.1733490566037736,
"grad_norm": 0.7474551796913147,
"learning_rate": 9.678745228259798e-05,
"loss": 0.4168,
"step": 3980
},
{
"epoch": 1.1762971698113207,
"grad_norm": 0.7025018930435181,
"learning_rate": 9.677105222354203e-05,
"loss": 0.4346,
"step": 3990
},
{
"epoch": 1.179245283018868,
"grad_norm": 0.5474650859832764,
"learning_rate": 9.675461180648498e-05,
"loss": 0.452,
"step": 4000
},
{
"epoch": 1.179245283018868,
"eval_runtime": 2152.6847,
"eval_samples_per_second": 4.203,
"eval_steps_per_second": 0.525,
"step": 4000
},
{
"epoch": 1.182193396226415,
"grad_norm": 0.6443697810173035,
"learning_rate": 9.673813104561295e-05,
"loss": 0.4097,
"step": 4010
},
{
"epoch": 1.1851415094339623,
"grad_norm": 0.6092158555984497,
"learning_rate": 9.672160995514696e-05,
"loss": 0.4491,
"step": 4020
},
{
"epoch": 1.1880896226415094,
"grad_norm": 0.7118765711784363,
"learning_rate": 9.670504854934281e-05,
"loss": 0.4221,
"step": 4030
},
{
"epoch": 1.1910377358490567,
"grad_norm": 0.7270048260688782,
"learning_rate": 9.668844684249106e-05,
"loss": 0.436,
"step": 4040
},
{
"epoch": 1.1939858490566038,
"grad_norm": 0.680564820766449,
"learning_rate": 9.667180484891706e-05,
"loss": 0.449,
"step": 4050
},
{
"epoch": 1.196933962264151,
"grad_norm": 0.8482938408851624,
"learning_rate": 9.665512258298092e-05,
"loss": 0.4195,
"step": 4060
},
{
"epoch": 1.1998820754716981,
"grad_norm": 0.6847683191299438,
"learning_rate": 9.66384000590775e-05,
"loss": 0.4467,
"step": 4070
},
{
"epoch": 1.2028301886792452,
"grad_norm": 0.7688137292861938,
"learning_rate": 9.662163729163642e-05,
"loss": 0.4261,
"step": 4080
},
{
"epoch": 1.2057783018867925,
"grad_norm": 0.6692184209823608,
"learning_rate": 9.660483429512199e-05,
"loss": 0.4168,
"step": 4090
},
{
"epoch": 1.2087264150943395,
"grad_norm": 0.677253007888794,
"learning_rate": 9.658799108403324e-05,
"loss": 0.4315,
"step": 4100
},
{
"epoch": 1.2116745283018868,
"grad_norm": 0.5852985978126526,
"learning_rate": 9.657110767290394e-05,
"loss": 0.404,
"step": 4110
},
{
"epoch": 1.2146226415094339,
"grad_norm": 0.6200850605964661,
"learning_rate": 9.65541840763025e-05,
"loss": 0.4096,
"step": 4120
},
{
"epoch": 1.2175707547169812,
"grad_norm": 0.7625101804733276,
"learning_rate": 9.653722030883204e-05,
"loss": 0.4171,
"step": 4130
},
{
"epoch": 1.2205188679245282,
"grad_norm": 0.6981749534606934,
"learning_rate": 9.65202163851303e-05,
"loss": 0.4306,
"step": 4140
},
{
"epoch": 1.2234669811320755,
"grad_norm": 0.7978874444961548,
"learning_rate": 9.650317231986971e-05,
"loss": 0.4347,
"step": 4150
},
{
"epoch": 1.2264150943396226,
"grad_norm": 0.5996639728546143,
"learning_rate": 9.648608812775734e-05,
"loss": 0.4252,
"step": 4160
},
{
"epoch": 1.2293632075471699,
"grad_norm": 0.6235437393188477,
"learning_rate": 9.646896382353483e-05,
"loss": 0.3944,
"step": 4170
},
{
"epoch": 1.232311320754717,
"grad_norm": 0.5777077078819275,
"learning_rate": 9.64517994219785e-05,
"loss": 0.4505,
"step": 4180
},
{
"epoch": 1.2352594339622642,
"grad_norm": 0.5862837433815002,
"learning_rate": 9.643459493789926e-05,
"loss": 0.4279,
"step": 4190
},
{
"epoch": 1.2382075471698113,
"grad_norm": 0.7375636696815491,
"learning_rate": 9.641735038614254e-05,
"loss": 0.4079,
"step": 4200
},
{
"epoch": 1.2411556603773586,
"grad_norm": 0.5712253451347351,
"learning_rate": 9.640006578158843e-05,
"loss": 0.4369,
"step": 4210
},
{
"epoch": 1.2441037735849056,
"grad_norm": 0.6323943734169006,
"learning_rate": 9.638274113915151e-05,
"loss": 0.4082,
"step": 4220
},
{
"epoch": 1.2470518867924527,
"grad_norm": 0.6632819771766663,
"learning_rate": 9.636537647378097e-05,
"loss": 0.4107,
"step": 4230
},
{
"epoch": 1.25,
"grad_norm": 0.5926976799964905,
"learning_rate": 9.634797180046049e-05,
"loss": 0.4354,
"step": 4240
},
{
"epoch": 1.2529481132075473,
"grad_norm": 0.5627853274345398,
"learning_rate": 9.633052713420827e-05,
"loss": 0.416,
"step": 4250
},
{
"epoch": 1.2558962264150944,
"grad_norm": 0.5623383522033691,
"learning_rate": 9.631304249007707e-05,
"loss": 0.4112,
"step": 4260
},
{
"epoch": 1.2588443396226414,
"grad_norm": 0.5470466613769531,
"learning_rate": 9.62955178831541e-05,
"loss": 0.4141,
"step": 4270
},
{
"epoch": 1.2617924528301887,
"grad_norm": 0.6820225119590759,
"learning_rate": 9.627795332856107e-05,
"loss": 0.4203,
"step": 4280
},
{
"epoch": 1.2647405660377358,
"grad_norm": 0.7964096665382385,
"learning_rate": 9.626034884145413e-05,
"loss": 0.4181,
"step": 4290
},
{
"epoch": 1.267688679245283,
"grad_norm": 0.5634545683860779,
"learning_rate": 9.624270443702395e-05,
"loss": 0.4302,
"step": 4300
},
{
"epoch": 1.2706367924528301,
"grad_norm": 0.49879544973373413,
"learning_rate": 9.622502013049557e-05,
"loss": 0.3951,
"step": 4310
},
{
"epoch": 1.2735849056603774,
"grad_norm": 0.8242729306221008,
"learning_rate": 9.620729593712854e-05,
"loss": 0.4195,
"step": 4320
},
{
"epoch": 1.2765330188679245,
"grad_norm": 0.6598034501075745,
"learning_rate": 9.618953187221676e-05,
"loss": 0.397,
"step": 4330
},
{
"epoch": 1.2794811320754718,
"grad_norm": 0.5287206172943115,
"learning_rate": 9.617172795108857e-05,
"loss": 0.4392,
"step": 4340
},
{
"epoch": 1.2824292452830188,
"grad_norm": 0.6088510155677795,
"learning_rate": 9.615388418910667e-05,
"loss": 0.4443,
"step": 4350
},
{
"epoch": 1.2853773584905661,
"grad_norm": 0.4323548376560211,
"learning_rate": 9.61360006016682e-05,
"loss": 0.3923,
"step": 4360
},
{
"epoch": 1.2883254716981132,
"grad_norm": 0.5699283480644226,
"learning_rate": 9.611807720420458e-05,
"loss": 0.4335,
"step": 4370
},
{
"epoch": 1.2912735849056602,
"grad_norm": 0.7797152400016785,
"learning_rate": 9.610011401218167e-05,
"loss": 0.4022,
"step": 4380
},
{
"epoch": 1.2942216981132075,
"grad_norm": 0.5590456128120422,
"learning_rate": 9.60821110410996e-05,
"loss": 0.4304,
"step": 4390
},
{
"epoch": 1.2971698113207548,
"grad_norm": 0.562135636806488,
"learning_rate": 9.606406830649283e-05,
"loss": 0.4112,
"step": 4400
},
{
"epoch": 1.3001179245283019,
"grad_norm": 0.7220986485481262,
"learning_rate": 9.604598582393021e-05,
"loss": 0.4413,
"step": 4410
},
{
"epoch": 1.303066037735849,
"grad_norm": 0.6028885245323181,
"learning_rate": 9.602786360901477e-05,
"loss": 0.4422,
"step": 4420
},
{
"epoch": 1.3060141509433962,
"grad_norm": 0.617222785949707,
"learning_rate": 9.60097016773839e-05,
"loss": 0.4342,
"step": 4430
},
{
"epoch": 1.3089622641509435,
"grad_norm": 0.7376841902732849,
"learning_rate": 9.599150004470925e-05,
"loss": 0.435,
"step": 4440
},
{
"epoch": 1.3119103773584906,
"grad_norm": 0.4664765000343323,
"learning_rate": 9.597325872669671e-05,
"loss": 0.4006,
"step": 4450
},
{
"epoch": 1.3148584905660377,
"grad_norm": 0.6231231689453125,
"learning_rate": 9.595497773908644e-05,
"loss": 0.4029,
"step": 4460
},
{
"epoch": 1.317806603773585,
"grad_norm": 0.5825844407081604,
"learning_rate": 9.59366570976528e-05,
"loss": 0.4222,
"step": 4470
},
{
"epoch": 1.320754716981132,
"grad_norm": 0.588737428188324,
"learning_rate": 9.591829681820441e-05,
"loss": 0.436,
"step": 4480
},
{
"epoch": 1.3237028301886793,
"grad_norm": 0.5719475746154785,
"learning_rate": 9.589989691658404e-05,
"loss": 0.4374,
"step": 4490
},
{
"epoch": 1.3266509433962264,
"grad_norm": 0.5231117010116577,
"learning_rate": 9.588145740866865e-05,
"loss": 0.4267,
"step": 4500
},
{
"epoch": 1.3295990566037736,
"grad_norm": 0.6021905541419983,
"learning_rate": 9.586297831036945e-05,
"loss": 0.4348,
"step": 4510
},
{
"epoch": 1.3325471698113207,
"grad_norm": 0.5159986019134521,
"learning_rate": 9.584445963763173e-05,
"loss": 0.4161,
"step": 4520
},
{
"epoch": 1.335495283018868,
"grad_norm": 0.584114134311676,
"learning_rate": 9.582590140643497e-05,
"loss": 0.3946,
"step": 4530
},
{
"epoch": 1.338443396226415,
"grad_norm": 0.6271203756332397,
"learning_rate": 9.580730363279278e-05,
"loss": 0.4146,
"step": 4540
},
{
"epoch": 1.3413915094339623,
"grad_norm": 0.594464898109436,
"learning_rate": 9.578866633275288e-05,
"loss": 0.4089,
"step": 4550
},
{
"epoch": 1.3443396226415094,
"grad_norm": 0.5215420126914978,
"learning_rate": 9.576998952239708e-05,
"loss": 0.4282,
"step": 4560
},
{
"epoch": 1.3472877358490565,
"grad_norm": 0.893401026725769,
"learning_rate": 9.575127321784135e-05,
"loss": 0.4048,
"step": 4570
},
{
"epoch": 1.3502358490566038,
"grad_norm": 0.5551811456680298,
"learning_rate": 9.573251743523565e-05,
"loss": 0.407,
"step": 4580
},
{
"epoch": 1.353183962264151,
"grad_norm": 0.9592118263244629,
"learning_rate": 9.571372219076407e-05,
"loss": 0.4159,
"step": 4590
},
{
"epoch": 1.3561320754716981,
"grad_norm": 0.6965057253837585,
"learning_rate": 9.569488750064472e-05,
"loss": 0.4397,
"step": 4600
},
{
"epoch": 1.3590801886792452,
"grad_norm": 0.6164469122886658,
"learning_rate": 9.567601338112976e-05,
"loss": 0.4184,
"step": 4610
},
{
"epoch": 1.3620283018867925,
"grad_norm": 0.5811320543289185,
"learning_rate": 9.565709984850537e-05,
"loss": 0.4096,
"step": 4620
},
{
"epoch": 1.3649764150943398,
"grad_norm": 0.5724498629570007,
"learning_rate": 9.563814691909173e-05,
"loss": 0.4274,
"step": 4630
},
{
"epoch": 1.3679245283018868,
"grad_norm": 0.6571381092071533,
"learning_rate": 9.561915460924305e-05,
"loss": 0.396,
"step": 4640
},
{
"epoch": 1.3708726415094339,
"grad_norm": 0.6680477261543274,
"learning_rate": 9.560012293534746e-05,
"loss": 0.4363,
"step": 4650
},
{
"epoch": 1.3738207547169812,
"grad_norm": 0.7330737113952637,
"learning_rate": 9.55810519138271e-05,
"loss": 0.4105,
"step": 4660
},
{
"epoch": 1.3767688679245282,
"grad_norm": 0.5385764837265015,
"learning_rate": 9.556194156113807e-05,
"loss": 0.3802,
"step": 4670
},
{
"epoch": 1.3797169811320755,
"grad_norm": 0.6339823603630066,
"learning_rate": 9.554279189377035e-05,
"loss": 0.4131,
"step": 4680
},
{
"epoch": 1.3826650943396226,
"grad_norm": 0.5736427307128906,
"learning_rate": 9.552360292824795e-05,
"loss": 0.412,
"step": 4690
},
{
"epoch": 1.3856132075471699,
"grad_norm": 0.6890711188316345,
"learning_rate": 9.550437468112868e-05,
"loss": 0.4355,
"step": 4700
},
{
"epoch": 1.388561320754717,
"grad_norm": 0.5251554250717163,
"learning_rate": 9.548510716900427e-05,
"loss": 0.4146,
"step": 4710
},
{
"epoch": 1.3915094339622642,
"grad_norm": 0.584486186504364,
"learning_rate": 9.54658004085004e-05,
"loss": 0.4115,
"step": 4720
},
{
"epoch": 1.3944575471698113,
"grad_norm": 0.5341463685035706,
"learning_rate": 9.544645441627656e-05,
"loss": 0.3943,
"step": 4730
},
{
"epoch": 1.3974056603773586,
"grad_norm": 0.6700941324234009,
"learning_rate": 9.542706920902606e-05,
"loss": 0.4069,
"step": 4740
},
{
"epoch": 1.4003537735849056,
"grad_norm": 0.6363126039505005,
"learning_rate": 9.540764480347615e-05,
"loss": 0.4236,
"step": 4750
},
{
"epoch": 1.4033018867924527,
"grad_norm": 0.585870087146759,
"learning_rate": 9.538818121638779e-05,
"loss": 0.4384,
"step": 4760
},
{
"epoch": 1.40625,
"grad_norm": 0.7579383254051208,
"learning_rate": 9.536867846455582e-05,
"loss": 0.4231,
"step": 4770
},
{
"epoch": 1.4091981132075473,
"grad_norm": 0.5836581587791443,
"learning_rate": 9.534913656480886e-05,
"loss": 0.4165,
"step": 4780
},
{
"epoch": 1.4121462264150944,
"grad_norm": 0.5949292182922363,
"learning_rate": 9.53295555340093e-05,
"loss": 0.4399,
"step": 4790
},
{
"epoch": 1.4150943396226414,
"grad_norm": 0.5725260972976685,
"learning_rate": 9.530993538905331e-05,
"loss": 0.4151,
"step": 4800
},
{
"epoch": 1.4180424528301887,
"grad_norm": 0.6396368145942688,
"learning_rate": 9.529027614687081e-05,
"loss": 0.427,
"step": 4810
},
{
"epoch": 1.4209905660377358,
"grad_norm": 0.6774426102638245,
"learning_rate": 9.527057782442542e-05,
"loss": 0.4157,
"step": 4820
},
{
"epoch": 1.423938679245283,
"grad_norm": 0.6440579891204834,
"learning_rate": 9.525084043871452e-05,
"loss": 0.4245,
"step": 4830
},
{
"epoch": 1.4268867924528301,
"grad_norm": 0.7302789092063904,
"learning_rate": 9.523106400676923e-05,
"loss": 0.3854,
"step": 4840
},
{
"epoch": 1.4298349056603774,
"grad_norm": 0.6234352588653564,
"learning_rate": 9.521124854565425e-05,
"loss": 0.3888,
"step": 4850
},
{
"epoch": 1.4327830188679245,
"grad_norm": 0.6490001082420349,
"learning_rate": 9.519139407246807e-05,
"loss": 0.4036,
"step": 4860
},
{
"epoch": 1.4357311320754718,
"grad_norm": 0.7030500173568726,
"learning_rate": 9.517150060434281e-05,
"loss": 0.4072,
"step": 4870
},
{
"epoch": 1.4386792452830188,
"grad_norm": 0.5455010533332825,
"learning_rate": 9.51515681584442e-05,
"loss": 0.4187,
"step": 4880
},
{
"epoch": 1.4416273584905661,
"grad_norm": 0.5865360498428345,
"learning_rate": 9.513159675197166e-05,
"loss": 0.4159,
"step": 4890
},
{
"epoch": 1.4445754716981132,
"grad_norm": 0.6106439232826233,
"learning_rate": 9.511158640215819e-05,
"loss": 0.4162,
"step": 4900
},
{
"epoch": 1.4475235849056602,
"grad_norm": 0.9382206201553345,
"learning_rate": 9.509153712627037e-05,
"loss": 0.4209,
"step": 4910
},
{
"epoch": 1.4504716981132075,
"grad_norm": 0.5971977710723877,
"learning_rate": 9.507144894160847e-05,
"loss": 0.4144,
"step": 4920
},
{
"epoch": 1.4534198113207548,
"grad_norm": 0.684509813785553,
"learning_rate": 9.505132186550621e-05,
"loss": 0.4383,
"step": 4930
},
{
"epoch": 1.4563679245283019,
"grad_norm": 0.751724123954773,
"learning_rate": 9.503115591533094e-05,
"loss": 0.4047,
"step": 4940
},
{
"epoch": 1.459316037735849,
"grad_norm": 0.48539936542510986,
"learning_rate": 9.501095110848356e-05,
"loss": 0.4072,
"step": 4950
},
{
"epoch": 1.4622641509433962,
"grad_norm": 0.641118049621582,
"learning_rate": 9.499070746239845e-05,
"loss": 0.421,
"step": 4960
},
{
"epoch": 1.4652122641509435,
"grad_norm": 0.4774479269981384,
"learning_rate": 9.497042499454357e-05,
"loss": 0.4289,
"step": 4970
},
{
"epoch": 1.4681603773584906,
"grad_norm": 0.7362788319587708,
"learning_rate": 9.49501037224203e-05,
"loss": 0.3889,
"step": 4980
},
{
"epoch": 1.4711084905660377,
"grad_norm": 0.7180984020233154,
"learning_rate": 9.492974366356355e-05,
"loss": 0.4329,
"step": 4990
},
{
"epoch": 1.474056603773585,
"grad_norm": 0.6001200675964355,
"learning_rate": 9.490934483554172e-05,
"loss": 0.4361,
"step": 5000
},
{
"epoch": 1.474056603773585,
"eval_runtime": 2152.2039,
"eval_samples_per_second": 4.204,
"eval_steps_per_second": 0.526,
"step": 5000
},
{
"epoch": 1.477004716981132,
"grad_norm": 0.6098699569702148,
"learning_rate": 9.488890725595663e-05,
"loss": 0.4325,
"step": 5010
},
{
"epoch": 1.4799528301886793,
"grad_norm": 0.7573074102401733,
"learning_rate": 9.486843094244351e-05,
"loss": 0.4046,
"step": 5020
},
{
"epoch": 1.4829009433962264,
"grad_norm": 0.9371487498283386,
"learning_rate": 9.484791591267109e-05,
"loss": 0.4328,
"step": 5030
},
{
"epoch": 1.4858490566037736,
"grad_norm": 0.6104595065116882,
"learning_rate": 9.482736218434143e-05,
"loss": 0.4361,
"step": 5040
},
{
"epoch": 1.4887971698113207,
"grad_norm": 0.8826286792755127,
"learning_rate": 9.480676977519004e-05,
"loss": 0.4223,
"step": 5050
},
{
"epoch": 1.491745283018868,
"grad_norm": 0.5676646828651428,
"learning_rate": 9.478613870298578e-05,
"loss": 0.4372,
"step": 5060
},
{
"epoch": 1.494693396226415,
"grad_norm": 0.6027317047119141,
"learning_rate": 9.476546898553088e-05,
"loss": 0.4093,
"step": 5070
},
{
"epoch": 1.4976415094339623,
"grad_norm": 1.0308340787887573,
"learning_rate": 9.474476064066088e-05,
"loss": 0.4222,
"step": 5080
},
{
"epoch": 1.5005896226415094,
"grad_norm": 0.5098292827606201,
"learning_rate": 9.472401368624473e-05,
"loss": 0.4413,
"step": 5090
},
{
"epoch": 1.5035377358490565,
"grad_norm": 0.4316754937171936,
"learning_rate": 9.47032281401846e-05,
"loss": 0.4151,
"step": 5100
},
{
"epoch": 1.5064858490566038,
"grad_norm": 0.5580949187278748,
"learning_rate": 9.468240402041607e-05,
"loss": 0.4108,
"step": 5110
},
{
"epoch": 1.509433962264151,
"grad_norm": 0.554026186466217,
"learning_rate": 9.466154134490789e-05,
"loss": 0.4384,
"step": 5120
},
{
"epoch": 1.5123820754716981,
"grad_norm": 0.5462006330490112,
"learning_rate": 9.464064013166216e-05,
"loss": 0.4097,
"step": 5130
},
{
"epoch": 1.5153301886792452,
"grad_norm": 0.6762437224388123,
"learning_rate": 9.461970039871419e-05,
"loss": 0.4112,
"step": 5140
},
{
"epoch": 1.5182783018867925,
"grad_norm": 0.45170068740844727,
"learning_rate": 9.459872216413255e-05,
"loss": 0.4127,
"step": 5150
},
{
"epoch": 1.5212264150943398,
"grad_norm": 0.6126338243484497,
"learning_rate": 9.457770544601904e-05,
"loss": 0.418,
"step": 5160
},
{
"epoch": 1.5241745283018868,
"grad_norm": 0.5843132734298706,
"learning_rate": 9.455665026250864e-05,
"loss": 0.4128,
"step": 5170
},
{
"epoch": 1.5271226415094339,
"grad_norm": 0.5920013785362244,
"learning_rate": 9.453555663176954e-05,
"loss": 0.4287,
"step": 5180
},
{
"epoch": 1.5300707547169812,
"grad_norm": 0.5336861610412598,
"learning_rate": 9.451442457200308e-05,
"loss": 0.4395,
"step": 5190
},
{
"epoch": 1.5330188679245285,
"grad_norm": 0.6159524321556091,
"learning_rate": 9.449325410144382e-05,
"loss": 0.4172,
"step": 5200
},
{
"epoch": 1.5359669811320755,
"grad_norm": 0.5574519038200378,
"learning_rate": 9.447204523835939e-05,
"loss": 0.4053,
"step": 5210
},
{
"epoch": 1.5389150943396226,
"grad_norm": 0.7957310080528259,
"learning_rate": 9.44507980010506e-05,
"loss": 0.4295,
"step": 5220
},
{
"epoch": 1.5418632075471699,
"grad_norm": 0.6264283657073975,
"learning_rate": 9.442951240785135e-05,
"loss": 0.4461,
"step": 5230
},
{
"epoch": 1.544811320754717,
"grad_norm": 0.6128536462783813,
"learning_rate": 9.440818847712865e-05,
"loss": 0.4084,
"step": 5240
},
{
"epoch": 1.547759433962264,
"grad_norm": 0.7732954025268555,
"learning_rate": 9.438682622728256e-05,
"loss": 0.4321,
"step": 5250
},
{
"epoch": 1.5507075471698113,
"grad_norm": 0.5646283030509949,
"learning_rate": 9.436542567674625e-05,
"loss": 0.4135,
"step": 5260
},
{
"epoch": 1.5536556603773586,
"grad_norm": 0.5429633855819702,
"learning_rate": 9.43439868439859e-05,
"loss": 0.4171,
"step": 5270
},
{
"epoch": 1.5566037735849056,
"grad_norm": 0.6298280358314514,
"learning_rate": 9.432250974750074e-05,
"loss": 0.4336,
"step": 5280
},
{
"epoch": 1.5595518867924527,
"grad_norm": 0.509488046169281,
"learning_rate": 9.430099440582305e-05,
"loss": 0.4052,
"step": 5290
},
{
"epoch": 1.5625,
"grad_norm": 0.5488308668136597,
"learning_rate": 9.427944083751803e-05,
"loss": 0.4033,
"step": 5300
},
{
"epoch": 1.5654481132075473,
"grad_norm": 0.6165297627449036,
"learning_rate": 9.425784906118394e-05,
"loss": 0.4547,
"step": 5310
},
{
"epoch": 1.5683962264150944,
"grad_norm": 0.7992329597473145,
"learning_rate": 9.4236219095452e-05,
"loss": 0.4244,
"step": 5320
},
{
"epoch": 1.5713443396226414,
"grad_norm": 0.6273894309997559,
"learning_rate": 9.421455095898631e-05,
"loss": 0.423,
"step": 5330
},
{
"epoch": 1.5742924528301887,
"grad_norm": 0.5948377251625061,
"learning_rate": 9.419284467048401e-05,
"loss": 0.3893,
"step": 5340
},
{
"epoch": 1.577240566037736,
"grad_norm": 0.6253712773323059,
"learning_rate": 9.41711002486751e-05,
"loss": 0.4133,
"step": 5350
},
{
"epoch": 1.580188679245283,
"grad_norm": 0.48556405305862427,
"learning_rate": 9.41493177123225e-05,
"loss": 0.4082,
"step": 5360
},
{
"epoch": 1.5831367924528301,
"grad_norm": 0.567561149597168,
"learning_rate": 9.412749708022201e-05,
"loss": 0.4134,
"step": 5370
},
{
"epoch": 1.5860849056603774,
"grad_norm": 0.7854056358337402,
"learning_rate": 9.41056383712023e-05,
"loss": 0.4141,
"step": 5380
},
{
"epoch": 1.5890330188679245,
"grad_norm": 0.6564404368400574,
"learning_rate": 9.408374160412493e-05,
"loss": 0.4231,
"step": 5390
},
{
"epoch": 1.5919811320754715,
"grad_norm": 0.6413495540618896,
"learning_rate": 9.406180679788423e-05,
"loss": 0.4122,
"step": 5400
},
{
"epoch": 1.5949292452830188,
"grad_norm": 0.7277830839157104,
"learning_rate": 9.403983397140745e-05,
"loss": 0.4228,
"step": 5410
},
{
"epoch": 1.5978773584905661,
"grad_norm": 0.7624497413635254,
"learning_rate": 9.401782314365457e-05,
"loss": 0.4409,
"step": 5420
},
{
"epoch": 1.6008254716981132,
"grad_norm": 0.6038016080856323,
"learning_rate": 9.399577433361838e-05,
"loss": 0.431,
"step": 5430
},
{
"epoch": 1.6037735849056602,
"grad_norm": 0.6236042380332947,
"learning_rate": 9.397368756032445e-05,
"loss": 0.4324,
"step": 5440
},
{
"epoch": 1.6067216981132075,
"grad_norm": 0.7254793643951416,
"learning_rate": 9.395156284283113e-05,
"loss": 0.4101,
"step": 5450
},
{
"epoch": 1.6096698113207548,
"grad_norm": 0.5769889950752258,
"learning_rate": 9.392940020022946e-05,
"loss": 0.3998,
"step": 5460
},
{
"epoch": 1.6126179245283019,
"grad_norm": 0.5018852353096008,
"learning_rate": 9.390719965164323e-05,
"loss": 0.4063,
"step": 5470
},
{
"epoch": 1.615566037735849,
"grad_norm": 0.5254272222518921,
"learning_rate": 9.388496121622898e-05,
"loss": 0.4009,
"step": 5480
},
{
"epoch": 1.6185141509433962,
"grad_norm": 0.6437675952911377,
"learning_rate": 9.386268491317587e-05,
"loss": 0.3914,
"step": 5490
},
{
"epoch": 1.6214622641509435,
"grad_norm": 0.556174099445343,
"learning_rate": 9.384037076170577e-05,
"loss": 0.4394,
"step": 5500
},
{
"epoch": 1.6244103773584906,
"grad_norm": 0.5987389087677002,
"learning_rate": 9.381801878107323e-05,
"loss": 0.3962,
"step": 5510
},
{
"epoch": 1.6273584905660377,
"grad_norm": 0.6427960991859436,
"learning_rate": 9.379562899056542e-05,
"loss": 0.3865,
"step": 5520
},
{
"epoch": 1.630306603773585,
"grad_norm": 1.0582858324050903,
"learning_rate": 9.377320140950211e-05,
"loss": 0.4183,
"step": 5530
},
{
"epoch": 1.6332547169811322,
"grad_norm": 0.46756067872047424,
"learning_rate": 9.375073605723573e-05,
"loss": 0.3933,
"step": 5540
},
{
"epoch": 1.6362028301886793,
"grad_norm": 0.5007855296134949,
"learning_rate": 9.372823295315126e-05,
"loss": 0.4191,
"step": 5550
},
{
"epoch": 1.6391509433962264,
"grad_norm": 0.529598593711853,
"learning_rate": 9.370569211666628e-05,
"loss": 0.4349,
"step": 5560
},
{
"epoch": 1.6420990566037736,
"grad_norm": 0.5168789029121399,
"learning_rate": 9.368311356723091e-05,
"loss": 0.4234,
"step": 5570
},
{
"epoch": 1.6450471698113207,
"grad_norm": 0.7900917530059814,
"learning_rate": 9.366049732432786e-05,
"loss": 0.4146,
"step": 5580
},
{
"epoch": 1.6479952830188678,
"grad_norm": 0.6845492720603943,
"learning_rate": 9.363784340747228e-05,
"loss": 0.4027,
"step": 5590
},
{
"epoch": 1.650943396226415,
"grad_norm": 0.7339763045310974,
"learning_rate": 9.361515183621192e-05,
"loss": 0.3918,
"step": 5600
},
{
"epoch": 1.6538915094339623,
"grad_norm": 0.6561014652252197,
"learning_rate": 9.359242263012693e-05,
"loss": 0.412,
"step": 5610
},
{
"epoch": 1.6568396226415094,
"grad_norm": 0.6985560059547424,
"learning_rate": 9.356965580883004e-05,
"loss": 0.4465,
"step": 5620
},
{
"epoch": 1.6597877358490565,
"grad_norm": 0.9455322027206421,
"learning_rate": 9.354685139196633e-05,
"loss": 0.4297,
"step": 5630
},
{
"epoch": 1.6627358490566038,
"grad_norm": 0.5904430747032166,
"learning_rate": 9.35240093992134e-05,
"loss": 0.4339,
"step": 5640
},
{
"epoch": 1.665683962264151,
"grad_norm": 0.7788925170898438,
"learning_rate": 9.350112985028121e-05,
"loss": 0.431,
"step": 5650
},
{
"epoch": 1.6686320754716981,
"grad_norm": 0.6995145678520203,
"learning_rate": 9.34782127649122e-05,
"loss": 0.3985,
"step": 5660
},
{
"epoch": 1.6715801886792452,
"grad_norm": 0.626193642616272,
"learning_rate": 9.345525816288116e-05,
"loss": 0.4368,
"step": 5670
},
{
"epoch": 1.6745283018867925,
"grad_norm": 0.7121322154998779,
"learning_rate": 9.343226606399523e-05,
"loss": 0.3951,
"step": 5680
},
{
"epoch": 1.6774764150943398,
"grad_norm": 0.724651038646698,
"learning_rate": 9.340923648809392e-05,
"loss": 0.4104,
"step": 5690
},
{
"epoch": 1.6804245283018868,
"grad_norm": 0.6456372737884521,
"learning_rate": 9.338616945504912e-05,
"loss": 0.4429,
"step": 5700
},
{
"epoch": 1.6833726415094339,
"grad_norm": 0.48553207516670227,
"learning_rate": 9.336306498476499e-05,
"loss": 0.4126,
"step": 5710
},
{
"epoch": 1.6863207547169812,
"grad_norm": 0.6373352408409119,
"learning_rate": 9.333992309717801e-05,
"loss": 0.4193,
"step": 5720
},
{
"epoch": 1.6892688679245285,
"grad_norm": 0.6522802710533142,
"learning_rate": 9.331674381225696e-05,
"loss": 0.4301,
"step": 5730
},
{
"epoch": 1.6922169811320755,
"grad_norm": 0.5400438904762268,
"learning_rate": 9.329352715000285e-05,
"loss": 0.3763,
"step": 5740
},
{
"epoch": 1.6951650943396226,
"grad_norm": 0.46488505601882935,
"learning_rate": 9.327027313044901e-05,
"loss": 0.4229,
"step": 5750
},
{
"epoch": 1.6981132075471699,
"grad_norm": 0.5294913053512573,
"learning_rate": 9.324698177366095e-05,
"loss": 0.4434,
"step": 5760
},
{
"epoch": 1.701061320754717,
"grad_norm": 0.5483437180519104,
"learning_rate": 9.32236530997364e-05,
"loss": 0.401,
"step": 5770
},
{
"epoch": 1.704009433962264,
"grad_norm": 0.6428307294845581,
"learning_rate": 9.320028712880531e-05,
"loss": 0.4046,
"step": 5780
},
{
"epoch": 1.7069575471698113,
"grad_norm": 0.6045493483543396,
"learning_rate": 9.317688388102983e-05,
"loss": 0.4203,
"step": 5790
},
{
"epoch": 1.7099056603773586,
"grad_norm": 0.6167967915534973,
"learning_rate": 9.315344337660421e-05,
"loss": 0.4151,
"step": 5800
},
{
"epoch": 1.7128537735849056,
"grad_norm": 0.5861966609954834,
"learning_rate": 9.312996563575492e-05,
"loss": 0.4161,
"step": 5810
},
{
"epoch": 1.7158018867924527,
"grad_norm": 0.5049532055854797,
"learning_rate": 9.310645067874053e-05,
"loss": 0.4365,
"step": 5820
},
{
"epoch": 1.71875,
"grad_norm": 0.5511918663978577,
"learning_rate": 9.308289852585172e-05,
"loss": 0.414,
"step": 5830
},
{
"epoch": 1.7216981132075473,
"grad_norm": 0.5320383310317993,
"learning_rate": 9.305930919741126e-05,
"loss": 0.42,
"step": 5840
},
{
"epoch": 1.7246462264150944,
"grad_norm": 0.6752457618713379,
"learning_rate": 9.303568271377403e-05,
"loss": 0.4458,
"step": 5850
},
{
"epoch": 1.7275943396226414,
"grad_norm": 0.5691211819648743,
"learning_rate": 9.301201909532693e-05,
"loss": 0.4326,
"step": 5860
},
{
"epoch": 1.7305424528301887,
"grad_norm": 0.5890153050422668,
"learning_rate": 9.298831836248892e-05,
"loss": 0.4151,
"step": 5870
},
{
"epoch": 1.733490566037736,
"grad_norm": 0.6677072644233704,
"learning_rate": 9.2964580535711e-05,
"loss": 0.4058,
"step": 5880
},
{
"epoch": 1.736438679245283,
"grad_norm": 0.6226016879081726,
"learning_rate": 9.294080563547617e-05,
"loss": 0.4332,
"step": 5890
},
{
"epoch": 1.7393867924528301,
"grad_norm": 0.7695874571800232,
"learning_rate": 9.29169936822994e-05,
"loss": 0.4356,
"step": 5900
},
{
"epoch": 1.7423349056603774,
"grad_norm": 0.544867992401123,
"learning_rate": 9.289314469672766e-05,
"loss": 0.4086,
"step": 5910
},
{
"epoch": 1.7452830188679245,
"grad_norm": 0.5534322261810303,
"learning_rate": 9.286925869933984e-05,
"loss": 0.4088,
"step": 5920
},
{
"epoch": 1.7482311320754715,
"grad_norm": 0.5867837071418762,
"learning_rate": 9.284533571074682e-05,
"loss": 0.4068,
"step": 5930
},
{
"epoch": 1.7511792452830188,
"grad_norm": 0.6234986186027527,
"learning_rate": 9.282137575159135e-05,
"loss": 0.4126,
"step": 5940
},
{
"epoch": 1.7541273584905661,
"grad_norm": 0.6281249523162842,
"learning_rate": 9.279737884254811e-05,
"loss": 0.4267,
"step": 5950
},
{
"epoch": 1.7570754716981132,
"grad_norm": 0.6814997792243958,
"learning_rate": 9.277334500432364e-05,
"loss": 0.4333,
"step": 5960
},
{
"epoch": 1.7600235849056602,
"grad_norm": 0.6669684052467346,
"learning_rate": 9.274927425765638e-05,
"loss": 0.4203,
"step": 5970
},
{
"epoch": 1.7629716981132075,
"grad_norm": 0.6710285544395447,
"learning_rate": 9.272516662331658e-05,
"loss": 0.4446,
"step": 5980
},
{
"epoch": 1.7659198113207548,
"grad_norm": 0.5774241089820862,
"learning_rate": 9.270102212210632e-05,
"loss": 0.4027,
"step": 5990
},
{
"epoch": 1.7688679245283019,
"grad_norm": 0.7349326014518738,
"learning_rate": 9.267684077485954e-05,
"loss": 0.4076,
"step": 6000
},
{
"epoch": 1.7688679245283019,
"eval_runtime": 2151.7866,
"eval_samples_per_second": 4.204,
"eval_steps_per_second": 0.526,
"step": 6000
},
{
"epoch": 1.771816037735849,
"grad_norm": 0.7598684430122375,
"learning_rate": 9.265262260244193e-05,
"loss": 0.3982,
"step": 6010
},
{
"epoch": 1.7747641509433962,
"grad_norm": 0.6833178400993347,
"learning_rate": 9.262836762575096e-05,
"loss": 0.4024,
"step": 6020
},
{
"epoch": 1.7777122641509435,
"grad_norm": 0.5452631711959839,
"learning_rate": 9.260407586571589e-05,
"loss": 0.4294,
"step": 6030
},
{
"epoch": 1.7806603773584906,
"grad_norm": 0.5070096254348755,
"learning_rate": 9.257974734329766e-05,
"loss": 0.4214,
"step": 6040
},
{
"epoch": 1.7836084905660377,
"grad_norm": 0.5387268662452698,
"learning_rate": 9.255538207948899e-05,
"loss": 0.4111,
"step": 6050
},
{
"epoch": 1.786556603773585,
"grad_norm": 0.506958544254303,
"learning_rate": 9.253098009531428e-05,
"loss": 0.4043,
"step": 6060
},
{
"epoch": 1.7895047169811322,
"grad_norm": 0.6173824667930603,
"learning_rate": 9.250654141182962e-05,
"loss": 0.4235,
"step": 6070
},
{
"epoch": 1.7924528301886793,
"grad_norm": 0.48028188943862915,
"learning_rate": 9.248206605012275e-05,
"loss": 0.4084,
"step": 6080
},
{
"epoch": 1.7954009433962264,
"grad_norm": 0.8442564606666565,
"learning_rate": 9.245755403131307e-05,
"loss": 0.3851,
"step": 6090
},
{
"epoch": 1.7983490566037736,
"grad_norm": 0.6127468347549438,
"learning_rate": 9.243300537655162e-05,
"loss": 0.4273,
"step": 6100
},
{
"epoch": 1.8012971698113207,
"grad_norm": 0.557138979434967,
"learning_rate": 9.240842010702107e-05,
"loss": 0.4255,
"step": 6110
},
{
"epoch": 1.8042452830188678,
"grad_norm": 0.6045234203338623,
"learning_rate": 9.238379824393562e-05,
"loss": 0.4174,
"step": 6120
},
{
"epoch": 1.807193396226415,
"grad_norm": 0.5168408155441284,
"learning_rate": 9.23591398085411e-05,
"loss": 0.3997,
"step": 6130
},
{
"epoch": 1.8101415094339623,
"grad_norm": 1.3076364994049072,
"learning_rate": 9.233444482211488e-05,
"loss": 0.4117,
"step": 6140
},
{
"epoch": 1.8130896226415094,
"grad_norm": 0.7127142548561096,
"learning_rate": 9.23097133059659e-05,
"loss": 0.4334,
"step": 6150
},
{
"epoch": 1.8160377358490565,
"grad_norm": 0.5400004982948303,
"learning_rate": 9.228494528143458e-05,
"loss": 0.3984,
"step": 6160
},
{
"epoch": 1.8189858490566038,
"grad_norm": 0.7215672731399536,
"learning_rate": 9.226014076989283e-05,
"loss": 0.4228,
"step": 6170
},
{
"epoch": 1.821933962264151,
"grad_norm": 0.491253525018692,
"learning_rate": 9.22352997927441e-05,
"loss": 0.4223,
"step": 6180
},
{
"epoch": 1.8248820754716981,
"grad_norm": 0.527469277381897,
"learning_rate": 9.221042237142328e-05,
"loss": 0.4004,
"step": 6190
},
{
"epoch": 1.8278301886792452,
"grad_norm": 0.7260705232620239,
"learning_rate": 9.218550852739668e-05,
"loss": 0.391,
"step": 6200
},
{
"epoch": 1.8307783018867925,
"grad_norm": 0.6503361463546753,
"learning_rate": 9.216055828216208e-05,
"loss": 0.4026,
"step": 6210
},
{
"epoch": 1.8337264150943398,
"grad_norm": 0.645720362663269,
"learning_rate": 9.213557165724865e-05,
"loss": 0.3916,
"step": 6220
},
{
"epoch": 1.8366745283018868,
"grad_norm": 0.8612964749336243,
"learning_rate": 9.211054867421694e-05,
"loss": 0.3955,
"step": 6230
},
{
"epoch": 1.8396226415094339,
"grad_norm": 0.5916104912757874,
"learning_rate": 9.208548935465888e-05,
"loss": 0.4349,
"step": 6240
},
{
"epoch": 1.8425707547169812,
"grad_norm": 0.5687956809997559,
"learning_rate": 9.206039372019778e-05,
"loss": 0.4012,
"step": 6250
},
{
"epoch": 1.8455188679245285,
"grad_norm": 0.6275490522384644,
"learning_rate": 9.203526179248829e-05,
"loss": 0.3954,
"step": 6260
},
{
"epoch": 1.8484669811320755,
"grad_norm": 0.5562242269515991,
"learning_rate": 9.20100935932163e-05,
"loss": 0.4136,
"step": 6270
},
{
"epoch": 1.8514150943396226,
"grad_norm": 0.7400084733963013,
"learning_rate": 9.198488914409908e-05,
"loss": 0.4331,
"step": 6280
},
{
"epoch": 1.8543632075471699,
"grad_norm": 0.6026434898376465,
"learning_rate": 9.195964846688516e-05,
"loss": 0.4073,
"step": 6290
},
{
"epoch": 1.857311320754717,
"grad_norm": 0.6497745513916016,
"learning_rate": 9.19343715833543e-05,
"loss": 0.4221,
"step": 6300
},
{
"epoch": 1.860259433962264,
"grad_norm": 0.5947487354278564,
"learning_rate": 9.190905851531753e-05,
"loss": 0.4381,
"step": 6310
},
{
"epoch": 1.8632075471698113,
"grad_norm": 0.6505824327468872,
"learning_rate": 9.18837092846171e-05,
"loss": 0.4108,
"step": 6320
},
{
"epoch": 1.8661556603773586,
"grad_norm": 0.8787396550178528,
"learning_rate": 9.185832391312644e-05,
"loss": 0.4167,
"step": 6330
},
{
"epoch": 1.8691037735849056,
"grad_norm": 0.805752158164978,
"learning_rate": 9.18329024227502e-05,
"loss": 0.4445,
"step": 6340
},
{
"epoch": 1.8720518867924527,
"grad_norm": 0.6502511501312256,
"learning_rate": 9.18074448354242e-05,
"loss": 0.4413,
"step": 6350
},
{
"epoch": 1.875,
"grad_norm": 0.5842897295951843,
"learning_rate": 9.178195117311536e-05,
"loss": 0.3991,
"step": 6360
},
{
"epoch": 1.8779481132075473,
"grad_norm": 0.632724404335022,
"learning_rate": 9.175642145782179e-05,
"loss": 0.4174,
"step": 6370
},
{
"epoch": 1.8808962264150944,
"grad_norm": 0.6907057762145996,
"learning_rate": 9.173085571157264e-05,
"loss": 0.4136,
"step": 6380
},
{
"epoch": 1.8838443396226414,
"grad_norm": 0.6527078747749329,
"learning_rate": 9.170525395642821e-05,
"loss": 0.4337,
"step": 6390
},
{
"epoch": 1.8867924528301887,
"grad_norm": 0.5634362101554871,
"learning_rate": 9.167961621447985e-05,
"loss": 0.3948,
"step": 6400
},
{
"epoch": 1.889740566037736,
"grad_norm": 0.5869929790496826,
"learning_rate": 9.165394250784995e-05,
"loss": 0.4131,
"step": 6410
},
{
"epoch": 1.892688679245283,
"grad_norm": 0.5383230447769165,
"learning_rate": 9.162823285869198e-05,
"loss": 0.408,
"step": 6420
},
{
"epoch": 1.8956367924528301,
"grad_norm": 0.5522034168243408,
"learning_rate": 9.160248728919034e-05,
"loss": 0.4272,
"step": 6430
},
{
"epoch": 1.8985849056603774,
"grad_norm": 0.7313217520713806,
"learning_rate": 9.15767058215605e-05,
"loss": 0.4324,
"step": 6440
},
{
"epoch": 1.9015330188679245,
"grad_norm": 0.5812812447547913,
"learning_rate": 9.155088847804888e-05,
"loss": 0.4377,
"step": 6450
},
{
"epoch": 1.9044811320754715,
"grad_norm": 0.6667361855506897,
"learning_rate": 9.152503528093285e-05,
"loss": 0.4215,
"step": 6460
},
{
"epoch": 1.9074292452830188,
"grad_norm": 0.5269849300384521,
"learning_rate": 9.149914625252074e-05,
"loss": 0.4219,
"step": 6470
},
{
"epoch": 1.9103773584905661,
"grad_norm": 0.4926372468471527,
"learning_rate": 9.147322141515177e-05,
"loss": 0.4089,
"step": 6480
},
{
"epoch": 1.9133254716981132,
"grad_norm": 0.5828298330307007,
"learning_rate": 9.144726079119607e-05,
"loss": 0.4144,
"step": 6490
},
{
"epoch": 1.9162735849056602,
"grad_norm": 0.4607917070388794,
"learning_rate": 9.142126440305466e-05,
"loss": 0.423,
"step": 6500
},
{
"epoch": 1.9192216981132075,
"grad_norm": 0.6434059739112854,
"learning_rate": 9.139523227315942e-05,
"loss": 0.4154,
"step": 6510
},
{
"epoch": 1.9221698113207548,
"grad_norm": 0.6279144287109375,
"learning_rate": 9.136916442397304e-05,
"loss": 0.4266,
"step": 6520
},
{
"epoch": 1.9251179245283019,
"grad_norm": 0.649456262588501,
"learning_rate": 9.134306087798907e-05,
"loss": 0.4286,
"step": 6530
},
{
"epoch": 1.928066037735849,
"grad_norm": 0.6740241646766663,
"learning_rate": 9.131692165773184e-05,
"loss": 0.4177,
"step": 6540
},
{
"epoch": 1.9310141509433962,
"grad_norm": 0.5082387328147888,
"learning_rate": 9.129074678575649e-05,
"loss": 0.4181,
"step": 6550
},
{
"epoch": 1.9339622641509435,
"grad_norm": 0.6103249788284302,
"learning_rate": 9.126453628464888e-05,
"loss": 0.3862,
"step": 6560
},
{
"epoch": 1.9369103773584906,
"grad_norm": 0.525212287902832,
"learning_rate": 9.123829017702563e-05,
"loss": 0.4075,
"step": 6570
},
{
"epoch": 1.9398584905660377,
"grad_norm": 0.7706009745597839,
"learning_rate": 9.121200848553413e-05,
"loss": 0.4052,
"step": 6580
},
{
"epoch": 1.942806603773585,
"grad_norm": 0.624443531036377,
"learning_rate": 9.118569123285238e-05,
"loss": 0.4184,
"step": 6590
},
{
"epoch": 1.9457547169811322,
"grad_norm": 0.7658302783966064,
"learning_rate": 9.115933844168918e-05,
"loss": 0.4141,
"step": 6600
},
{
"epoch": 1.9487028301886793,
"grad_norm": 0.4986574947834015,
"learning_rate": 9.113295013478389e-05,
"loss": 0.4041,
"step": 6610
},
{
"epoch": 1.9516509433962264,
"grad_norm": 0.6452513933181763,
"learning_rate": 9.110652633490659e-05,
"loss": 0.4035,
"step": 6620
},
{
"epoch": 1.9545990566037736,
"grad_norm": 0.7180289030075073,
"learning_rate": 9.108006706485794e-05,
"loss": 0.3813,
"step": 6630
},
{
"epoch": 1.9575471698113207,
"grad_norm": 0.6559609770774841,
"learning_rate": 9.105357234746925e-05,
"loss": 0.4111,
"step": 6640
},
{
"epoch": 1.9604952830188678,
"grad_norm": 0.6598471403121948,
"learning_rate": 9.102704220560237e-05,
"loss": 0.4115,
"step": 6650
},
{
"epoch": 1.963443396226415,
"grad_norm": 0.6768516898155212,
"learning_rate": 9.100047666214975e-05,
"loss": 0.4241,
"step": 6660
},
{
"epoch": 1.9663915094339623,
"grad_norm": 0.550957441329956,
"learning_rate": 9.097387574003436e-05,
"loss": 0.417,
"step": 6670
},
{
"epoch": 1.9693396226415094,
"grad_norm": 0.8207155466079712,
"learning_rate": 9.094723946220975e-05,
"loss": 0.4099,
"step": 6680
},
{
"epoch": 1.9722877358490565,
"grad_norm": 0.6370198726654053,
"learning_rate": 9.092056785165992e-05,
"loss": 0.4097,
"step": 6690
},
{
"epoch": 1.9752358490566038,
"grad_norm": 0.7340497970581055,
"learning_rate": 9.089386093139937e-05,
"loss": 0.3906,
"step": 6700
},
{
"epoch": 1.978183962264151,
"grad_norm": 0.6578039526939392,
"learning_rate": 9.08671187244731e-05,
"loss": 0.4256,
"step": 6710
},
{
"epoch": 1.9811320754716981,
"grad_norm": 0.5405383706092834,
"learning_rate": 9.084034125395653e-05,
"loss": 0.4037,
"step": 6720
},
{
"epoch": 1.9840801886792452,
"grad_norm": 0.6303375959396362,
"learning_rate": 9.081352854295552e-05,
"loss": 0.4244,
"step": 6730
},
{
"epoch": 1.9870283018867925,
"grad_norm": 0.5610900521278381,
"learning_rate": 9.078668061460632e-05,
"loss": 0.4239,
"step": 6740
},
{
"epoch": 1.9899764150943398,
"grad_norm": 0.7033693790435791,
"learning_rate": 9.075979749207561e-05,
"loss": 0.3992,
"step": 6750
},
{
"epoch": 1.9929245283018868,
"grad_norm": 0.7399113774299622,
"learning_rate": 9.073287919856038e-05,
"loss": 0.4062,
"step": 6760
},
{
"epoch": 1.9958726415094339,
"grad_norm": 0.5513764023780823,
"learning_rate": 9.070592575728801e-05,
"loss": 0.4173,
"step": 6770
},
{
"epoch": 1.9988207547169812,
"grad_norm": 0.5369357466697693,
"learning_rate": 9.067893719151621e-05,
"loss": 0.4153,
"step": 6780
},
{
"epoch": 2.0017688679245285,
"grad_norm": 0.6515247225761414,
"learning_rate": 9.065191352453297e-05,
"loss": 0.4219,
"step": 6790
},
{
"epoch": 2.0047169811320753,
"grad_norm": 0.53224116563797,
"learning_rate": 9.062485477965661e-05,
"loss": 0.4207,
"step": 6800
},
{
"epoch": 2.0076650943396226,
"grad_norm": 0.5387572646141052,
"learning_rate": 9.059776098023567e-05,
"loss": 0.3944,
"step": 6810
},
{
"epoch": 2.01061320754717,
"grad_norm": 0.6075940132141113,
"learning_rate": 9.057063214964896e-05,
"loss": 0.402,
"step": 6820
},
{
"epoch": 2.013561320754717,
"grad_norm": 0.5321000218391418,
"learning_rate": 9.054346831130551e-05,
"loss": 0.403,
"step": 6830
},
{
"epoch": 2.016509433962264,
"grad_norm": 0.5818955302238464,
"learning_rate": 9.05162694886446e-05,
"loss": 0.4004,
"step": 6840
},
{
"epoch": 2.0194575471698113,
"grad_norm": 0.5947965979576111,
"learning_rate": 9.048903570513565e-05,
"loss": 0.3953,
"step": 6850
},
{
"epoch": 2.0224056603773586,
"grad_norm": 0.7267137169837952,
"learning_rate": 9.046176698427824e-05,
"loss": 0.389,
"step": 6860
},
{
"epoch": 2.025353773584906,
"grad_norm": 0.5267451405525208,
"learning_rate": 9.043446334960214e-05,
"loss": 0.3828,
"step": 6870
},
{
"epoch": 2.0283018867924527,
"grad_norm": 0.6209588646888733,
"learning_rate": 9.040712482466723e-05,
"loss": 0.402,
"step": 6880
},
{
"epoch": 2.03125,
"grad_norm": 0.6093756556510925,
"learning_rate": 9.037975143306347e-05,
"loss": 0.3822,
"step": 6890
},
{
"epoch": 2.0341981132075473,
"grad_norm": 0.639741837978363,
"learning_rate": 9.035234319841095e-05,
"loss": 0.4233,
"step": 6900
},
{
"epoch": 2.037146226415094,
"grad_norm": 0.6672208309173584,
"learning_rate": 9.032490014435978e-05,
"loss": 0.3942,
"step": 6910
},
{
"epoch": 2.0400943396226414,
"grad_norm": 0.6828808188438416,
"learning_rate": 9.029742229459015e-05,
"loss": 0.4292,
"step": 6920
},
{
"epoch": 2.0430424528301887,
"grad_norm": 0.6988856196403503,
"learning_rate": 9.026990967281224e-05,
"loss": 0.3917,
"step": 6930
},
{
"epoch": 2.045990566037736,
"grad_norm": 0.6272420883178711,
"learning_rate": 9.024236230276629e-05,
"loss": 0.3932,
"step": 6940
},
{
"epoch": 2.048938679245283,
"grad_norm": 0.6676560640335083,
"learning_rate": 9.021478020822246e-05,
"loss": 0.4225,
"step": 6950
},
{
"epoch": 2.05188679245283,
"grad_norm": 0.6416158676147461,
"learning_rate": 9.018716341298094e-05,
"loss": 0.4046,
"step": 6960
},
{
"epoch": 2.0548349056603774,
"grad_norm": 0.6192477941513062,
"learning_rate": 9.015951194087178e-05,
"loss": 0.4216,
"step": 6970
},
{
"epoch": 2.0577830188679247,
"grad_norm": 0.5341530442237854,
"learning_rate": 9.013182581575503e-05,
"loss": 0.3982,
"step": 6980
},
{
"epoch": 2.0607311320754715,
"grad_norm": 0.6169180274009705,
"learning_rate": 9.01041050615206e-05,
"loss": 0.4176,
"step": 6990
},
{
"epoch": 2.063679245283019,
"grad_norm": 0.6272059679031372,
"learning_rate": 9.007634970208829e-05,
"loss": 0.421,
"step": 7000
},
{
"epoch": 2.063679245283019,
"eval_runtime": 2143.9516,
"eval_samples_per_second": 4.22,
"eval_steps_per_second": 0.528,
"step": 7000
},
{
"epoch": 2.066627358490566,
"grad_norm": 0.5706269145011902,
"learning_rate": 9.004855976140776e-05,
"loss": 0.3974,
"step": 7010
},
{
"epoch": 2.0695754716981134,
"grad_norm": 0.5736098289489746,
"learning_rate": 9.002073526345851e-05,
"loss": 0.4264,
"step": 7020
},
{
"epoch": 2.0725235849056602,
"grad_norm": 0.6415402293205261,
"learning_rate": 8.999287623224986e-05,
"loss": 0.4026,
"step": 7030
},
{
"epoch": 2.0754716981132075,
"grad_norm": 0.7684317827224731,
"learning_rate": 8.996498269182092e-05,
"loss": 0.4224,
"step": 7040
},
{
"epoch": 2.078419811320755,
"grad_norm": 0.6711016893386841,
"learning_rate": 8.99370546662406e-05,
"loss": 0.398,
"step": 7050
},
{
"epoch": 2.081367924528302,
"grad_norm": 0.7194101810455322,
"learning_rate": 8.990909217960754e-05,
"loss": 0.4327,
"step": 7060
},
{
"epoch": 2.084316037735849,
"grad_norm": 0.6347222924232483,
"learning_rate": 8.988109525605015e-05,
"loss": 0.3987,
"step": 7070
},
{
"epoch": 2.0872641509433962,
"grad_norm": 0.7122989892959595,
"learning_rate": 8.98530639197265e-05,
"loss": 0.4134,
"step": 7080
},
{
"epoch": 2.0902122641509435,
"grad_norm": 0.6146140694618225,
"learning_rate": 8.982499819482439e-05,
"loss": 0.4229,
"step": 7090
},
{
"epoch": 2.0931603773584904,
"grad_norm": 0.7223630547523499,
"learning_rate": 8.979689810556131e-05,
"loss": 0.422,
"step": 7100
},
{
"epoch": 2.0961084905660377,
"grad_norm": 0.5322486162185669,
"learning_rate": 8.976876367618437e-05,
"loss": 0.4228,
"step": 7110
},
{
"epoch": 2.099056603773585,
"grad_norm": 0.6254021525382996,
"learning_rate": 8.974059493097034e-05,
"loss": 0.4064,
"step": 7120
},
{
"epoch": 2.1020047169811322,
"grad_norm": 0.9736713767051697,
"learning_rate": 8.971239189422555e-05,
"loss": 0.3805,
"step": 7130
},
{
"epoch": 2.104952830188679,
"grad_norm": 0.6928995251655579,
"learning_rate": 8.968415459028598e-05,
"loss": 0.3931,
"step": 7140
},
{
"epoch": 2.1079009433962264,
"grad_norm": 0.8041650056838989,
"learning_rate": 8.965588304351717e-05,
"loss": 0.4216,
"step": 7150
},
{
"epoch": 2.1108490566037736,
"grad_norm": 0.6400204300880432,
"learning_rate": 8.962757727831414e-05,
"loss": 0.4199,
"step": 7160
},
{
"epoch": 2.113797169811321,
"grad_norm": 0.5853191018104553,
"learning_rate": 8.959923731910154e-05,
"loss": 0.4237,
"step": 7170
},
{
"epoch": 2.1167452830188678,
"grad_norm": 0.5251668095588684,
"learning_rate": 8.957086319033343e-05,
"loss": 0.4198,
"step": 7180
},
{
"epoch": 2.119693396226415,
"grad_norm": 0.6272122859954834,
"learning_rate": 8.954245491649344e-05,
"loss": 0.379,
"step": 7190
},
{
"epoch": 2.1226415094339623,
"grad_norm": 0.6416386961936951,
"learning_rate": 8.951401252209456e-05,
"loss": 0.4228,
"step": 7200
},
{
"epoch": 2.1255896226415096,
"grad_norm": 0.5584208369255066,
"learning_rate": 8.948553603167934e-05,
"loss": 0.4045,
"step": 7210
},
{
"epoch": 2.1285377358490565,
"grad_norm": 0.6313793659210205,
"learning_rate": 8.945702546981969e-05,
"loss": 0.4138,
"step": 7220
},
{
"epoch": 2.1314858490566038,
"grad_norm": 0.6189001202583313,
"learning_rate": 8.942848086111689e-05,
"loss": 0.4138,
"step": 7230
},
{
"epoch": 2.134433962264151,
"grad_norm": 0.6030403971672058,
"learning_rate": 8.939990223020163e-05,
"loss": 0.375,
"step": 7240
},
{
"epoch": 2.137382075471698,
"grad_norm": 0.7220085859298706,
"learning_rate": 8.937128960173399e-05,
"loss": 0.4023,
"step": 7250
},
{
"epoch": 2.140330188679245,
"grad_norm": 0.6200517416000366,
"learning_rate": 8.934264300040333e-05,
"loss": 0.4126,
"step": 7260
},
{
"epoch": 2.1432783018867925,
"grad_norm": 0.921150803565979,
"learning_rate": 8.931396245092835e-05,
"loss": 0.3922,
"step": 7270
},
{
"epoch": 2.1462264150943398,
"grad_norm": 0.5805622935295105,
"learning_rate": 8.928524797805706e-05,
"loss": 0.4263,
"step": 7280
},
{
"epoch": 2.1491745283018866,
"grad_norm": 0.5072749853134155,
"learning_rate": 8.925649960656673e-05,
"loss": 0.3982,
"step": 7290
},
{
"epoch": 2.152122641509434,
"grad_norm": 0.519446074962616,
"learning_rate": 8.922771736126383e-05,
"loss": 0.3887,
"step": 7300
},
{
"epoch": 2.155070754716981,
"grad_norm": 0.683449923992157,
"learning_rate": 8.919890126698416e-05,
"loss": 0.4143,
"step": 7310
},
{
"epoch": 2.1580188679245285,
"grad_norm": 0.7214087247848511,
"learning_rate": 8.917005134859263e-05,
"loss": 0.4121,
"step": 7320
},
{
"epoch": 2.1609669811320753,
"grad_norm": 0.5989852547645569,
"learning_rate": 8.914116763098339e-05,
"loss": 0.4157,
"step": 7330
},
{
"epoch": 2.1639150943396226,
"grad_norm": 0.5464280843734741,
"learning_rate": 8.911225013907976e-05,
"loss": 0.4418,
"step": 7340
},
{
"epoch": 2.16686320754717,
"grad_norm": 0.6044255495071411,
"learning_rate": 8.908329889783418e-05,
"loss": 0.387,
"step": 7350
},
{
"epoch": 2.169811320754717,
"grad_norm": 0.8473643660545349,
"learning_rate": 8.905431393222819e-05,
"loss": 0.4114,
"step": 7360
},
{
"epoch": 2.172759433962264,
"grad_norm": 0.5799746513366699,
"learning_rate": 8.902529526727247e-05,
"loss": 0.4212,
"step": 7370
},
{
"epoch": 2.1757075471698113,
"grad_norm": 0.649660587310791,
"learning_rate": 8.899624292800681e-05,
"loss": 0.4204,
"step": 7380
},
{
"epoch": 2.1786556603773586,
"grad_norm": 0.6192954778671265,
"learning_rate": 8.896715693949997e-05,
"loss": 0.3812,
"step": 7390
},
{
"epoch": 2.1816037735849054,
"grad_norm": 0.6519819498062134,
"learning_rate": 8.89380373268498e-05,
"loss": 0.3974,
"step": 7400
},
{
"epoch": 2.1845518867924527,
"grad_norm": 0.6003890037536621,
"learning_rate": 8.890888411518315e-05,
"loss": 0.4126,
"step": 7410
},
{
"epoch": 2.1875,
"grad_norm": 0.6291475892066956,
"learning_rate": 8.887969732965587e-05,
"loss": 0.4141,
"step": 7420
},
{
"epoch": 2.1904481132075473,
"grad_norm": 0.657136857509613,
"learning_rate": 8.885047699545277e-05,
"loss": 0.4106,
"step": 7430
},
{
"epoch": 2.1933962264150946,
"grad_norm": 0.5920342206954956,
"learning_rate": 8.882122313778762e-05,
"loss": 0.3908,
"step": 7440
},
{
"epoch": 2.1963443396226414,
"grad_norm": 0.6525407433509827,
"learning_rate": 8.87919357819031e-05,
"loss": 0.4,
"step": 7450
},
{
"epoch": 2.1992924528301887,
"grad_norm": 0.6435882449150085,
"learning_rate": 8.876261495307083e-05,
"loss": 0.4095,
"step": 7460
},
{
"epoch": 2.202240566037736,
"grad_norm": 0.5499066710472107,
"learning_rate": 8.873326067659127e-05,
"loss": 0.4111,
"step": 7470
},
{
"epoch": 2.205188679245283,
"grad_norm": 0.6083002686500549,
"learning_rate": 8.870387297779377e-05,
"loss": 0.3942,
"step": 7480
},
{
"epoch": 2.20813679245283,
"grad_norm": 1.4924118518829346,
"learning_rate": 8.86744518820365e-05,
"loss": 0.422,
"step": 7490
},
{
"epoch": 2.2110849056603774,
"grad_norm": 0.611552894115448,
"learning_rate": 8.864499741470645e-05,
"loss": 0.4118,
"step": 7500
},
{
"epoch": 2.2140330188679247,
"grad_norm": 0.5591704249382019,
"learning_rate": 8.861550960121945e-05,
"loss": 0.3981,
"step": 7510
},
{
"epoch": 2.2169811320754715,
"grad_norm": 0.6003575921058655,
"learning_rate": 8.858598846702005e-05,
"loss": 0.4164,
"step": 7520
},
{
"epoch": 2.219929245283019,
"grad_norm": 0.6059021949768066,
"learning_rate": 8.855643403758153e-05,
"loss": 0.3945,
"step": 7530
},
{
"epoch": 2.222877358490566,
"grad_norm": 0.612736701965332,
"learning_rate": 8.852684633840601e-05,
"loss": 0.4151,
"step": 7540
},
{
"epoch": 2.2258254716981134,
"grad_norm": 0.6779835820198059,
"learning_rate": 8.84972253950242e-05,
"loss": 0.3663,
"step": 7550
},
{
"epoch": 2.2287735849056602,
"grad_norm": 0.7399649024009705,
"learning_rate": 8.846757123299556e-05,
"loss": 0.3909,
"step": 7560
},
{
"epoch": 2.2317216981132075,
"grad_norm": 0.9876378774642944,
"learning_rate": 8.84378838779082e-05,
"loss": 0.4119,
"step": 7570
},
{
"epoch": 2.234669811320755,
"grad_norm": 0.49867725372314453,
"learning_rate": 8.840816335537885e-05,
"loss": 0.3779,
"step": 7580
},
{
"epoch": 2.237617924528302,
"grad_norm": 0.7300730347633362,
"learning_rate": 8.837840969105289e-05,
"loss": 0.4038,
"step": 7590
},
{
"epoch": 2.240566037735849,
"grad_norm": 0.5498932600021362,
"learning_rate": 8.834862291060428e-05,
"loss": 0.3978,
"step": 7600
},
{
"epoch": 2.2435141509433962,
"grad_norm": 0.5018678903579712,
"learning_rate": 8.831880303973558e-05,
"loss": 0.3941,
"step": 7610
},
{
"epoch": 2.2464622641509435,
"grad_norm": 0.727459192276001,
"learning_rate": 8.828895010417789e-05,
"loss": 0.4017,
"step": 7620
},
{
"epoch": 2.2494103773584904,
"grad_norm": 0.5887342691421509,
"learning_rate": 8.82590641296908e-05,
"loss": 0.4283,
"step": 7630
},
{
"epoch": 2.2523584905660377,
"grad_norm": 0.5773307085037231,
"learning_rate": 8.822914514206248e-05,
"loss": 0.3967,
"step": 7640
},
{
"epoch": 2.255306603773585,
"grad_norm": 0.5532861948013306,
"learning_rate": 8.819919316710953e-05,
"loss": 0.4058,
"step": 7650
},
{
"epoch": 2.2582547169811322,
"grad_norm": 0.5850034952163696,
"learning_rate": 8.816920823067703e-05,
"loss": 0.3813,
"step": 7660
},
{
"epoch": 2.261202830188679,
"grad_norm": 0.5167927742004395,
"learning_rate": 8.813919035863854e-05,
"loss": 0.4068,
"step": 7670
},
{
"epoch": 2.2641509433962264,
"grad_norm": 0.698789119720459,
"learning_rate": 8.810913957689597e-05,
"loss": 0.3995,
"step": 7680
},
{
"epoch": 2.2670990566037736,
"grad_norm": 0.6801536679267883,
"learning_rate": 8.807905591137969e-05,
"loss": 0.4048,
"step": 7690
},
{
"epoch": 2.270047169811321,
"grad_norm": 0.5517321228981018,
"learning_rate": 8.80489393880484e-05,
"loss": 0.3902,
"step": 7700
},
{
"epoch": 2.2729952830188678,
"grad_norm": 0.5244399905204773,
"learning_rate": 8.801879003288918e-05,
"loss": 0.4005,
"step": 7710
},
{
"epoch": 2.275943396226415,
"grad_norm": 0.6431949734687805,
"learning_rate": 8.798860787191743e-05,
"loss": 0.4031,
"step": 7720
},
{
"epoch": 2.2788915094339623,
"grad_norm": 0.5005143880844116,
"learning_rate": 8.795839293117687e-05,
"loss": 0.4194,
"step": 7730
},
{
"epoch": 2.2818396226415096,
"grad_norm": 0.5353305339813232,
"learning_rate": 8.792814523673948e-05,
"loss": 0.3872,
"step": 7740
},
{
"epoch": 2.2847877358490565,
"grad_norm": 0.7902474403381348,
"learning_rate": 8.789786481470553e-05,
"loss": 0.4177,
"step": 7750
},
{
"epoch": 2.2877358490566038,
"grad_norm": 0.5796612501144409,
"learning_rate": 8.786755169120351e-05,
"loss": 0.3727,
"step": 7760
},
{
"epoch": 2.290683962264151,
"grad_norm": 0.6347728371620178,
"learning_rate": 8.783720589239013e-05,
"loss": 0.3885,
"step": 7770
},
{
"epoch": 2.293632075471698,
"grad_norm": 0.5600293874740601,
"learning_rate": 8.78068274444503e-05,
"loss": 0.3798,
"step": 7780
},
{
"epoch": 2.296580188679245,
"grad_norm": 0.7907067537307739,
"learning_rate": 8.777641637359711e-05,
"loss": 0.4045,
"step": 7790
},
{
"epoch": 2.2995283018867925,
"grad_norm": 0.6473096013069153,
"learning_rate": 8.774597270607178e-05,
"loss": 0.4169,
"step": 7800
},
{
"epoch": 2.3024764150943398,
"grad_norm": 0.5632277131080627,
"learning_rate": 8.77154964681437e-05,
"loss": 0.4205,
"step": 7810
},
{
"epoch": 2.3054245283018866,
"grad_norm": 0.6060307621955872,
"learning_rate": 8.76849876861103e-05,
"loss": 0.4054,
"step": 7820
},
{
"epoch": 2.308372641509434,
"grad_norm": 0.6059081554412842,
"learning_rate": 8.765444638629715e-05,
"loss": 0.3893,
"step": 7830
},
{
"epoch": 2.311320754716981,
"grad_norm": 0.658637285232544,
"learning_rate": 8.762387259505783e-05,
"loss": 0.3976,
"step": 7840
},
{
"epoch": 2.3142688679245285,
"grad_norm": 0.679071307182312,
"learning_rate": 8.759326633877398e-05,
"loss": 0.3956,
"step": 7850
},
{
"epoch": 2.3172169811320753,
"grad_norm": 0.9143447279930115,
"learning_rate": 8.756262764385528e-05,
"loss": 0.4041,
"step": 7860
},
{
"epoch": 2.3201650943396226,
"grad_norm": 0.6284430623054504,
"learning_rate": 8.753195653673936e-05,
"loss": 0.428,
"step": 7870
},
{
"epoch": 2.32311320754717,
"grad_norm": 0.7159178256988525,
"learning_rate": 8.750125304389183e-05,
"loss": 0.4138,
"step": 7880
},
{
"epoch": 2.326061320754717,
"grad_norm": 0.6185059547424316,
"learning_rate": 8.747051719180626e-05,
"loss": 0.3965,
"step": 7890
},
{
"epoch": 2.329009433962264,
"grad_norm": 0.6494713425636292,
"learning_rate": 8.743974900700414e-05,
"loss": 0.4063,
"step": 7900
},
{
"epoch": 2.3319575471698113,
"grad_norm": 0.48549047112464905,
"learning_rate": 8.74089485160348e-05,
"loss": 0.3944,
"step": 7910
},
{
"epoch": 2.3349056603773586,
"grad_norm": 0.6528279781341553,
"learning_rate": 8.737811574547556e-05,
"loss": 0.3843,
"step": 7920
},
{
"epoch": 2.3378537735849054,
"grad_norm": 0.614043116569519,
"learning_rate": 8.734725072193149e-05,
"loss": 0.3928,
"step": 7930
},
{
"epoch": 2.3408018867924527,
"grad_norm": 0.5889528393745422,
"learning_rate": 8.731635347203552e-05,
"loss": 0.4209,
"step": 7940
},
{
"epoch": 2.34375,
"grad_norm": 0.632585346698761,
"learning_rate": 8.728542402244846e-05,
"loss": 0.3885,
"step": 7950
},
{
"epoch": 2.3466981132075473,
"grad_norm": 0.6582772731781006,
"learning_rate": 8.725446239985877e-05,
"loss": 0.4077,
"step": 7960
},
{
"epoch": 2.3496462264150946,
"grad_norm": 0.4948737621307373,
"learning_rate": 8.722346863098279e-05,
"loss": 0.4143,
"step": 7970
},
{
"epoch": 2.3525943396226414,
"grad_norm": 0.6382238268852234,
"learning_rate": 8.719244274256452e-05,
"loss": 0.3918,
"step": 7980
},
{
"epoch": 2.3555424528301887,
"grad_norm": 0.6675527691841125,
"learning_rate": 8.716138476137577e-05,
"loss": 0.3983,
"step": 7990
},
{
"epoch": 2.358490566037736,
"grad_norm": 0.579544723033905,
"learning_rate": 8.71302947142159e-05,
"loss": 0.4064,
"step": 8000
},
{
"epoch": 2.358490566037736,
"eval_runtime": 2159.7483,
"eval_samples_per_second": 4.189,
"eval_steps_per_second": 0.524,
"step": 8000
},
{
"epoch": 2.361438679245283,
"grad_norm": 0.5494331121444702,
"learning_rate": 8.709917262791207e-05,
"loss": 0.3604,
"step": 8010
},
{
"epoch": 2.36438679245283,
"grad_norm": 0.7046252489089966,
"learning_rate": 8.706801852931903e-05,
"loss": 0.4144,
"step": 8020
},
{
"epoch": 2.3673349056603774,
"grad_norm": 0.7568420767784119,
"learning_rate": 8.703683244531915e-05,
"loss": 0.4299,
"step": 8030
},
{
"epoch": 2.3702830188679247,
"grad_norm": 0.5755845904350281,
"learning_rate": 8.70056144028224e-05,
"loss": 0.4012,
"step": 8040
},
{
"epoch": 2.3732311320754715,
"grad_norm": 0.6116190552711487,
"learning_rate": 8.697436442876636e-05,
"loss": 0.4131,
"step": 8050
},
{
"epoch": 2.376179245283019,
"grad_norm": 0.5517948865890503,
"learning_rate": 8.694308255011611e-05,
"loss": 0.396,
"step": 8060
},
{
"epoch": 2.379127358490566,
"grad_norm": 0.5410232543945312,
"learning_rate": 8.691176879386433e-05,
"loss": 0.3949,
"step": 8070
},
{
"epoch": 2.3820754716981134,
"grad_norm": 0.5844671130180359,
"learning_rate": 8.688042318703111e-05,
"loss": 0.4003,
"step": 8080
},
{
"epoch": 2.3850235849056602,
"grad_norm": 0.485344260931015,
"learning_rate": 8.684904575666413e-05,
"loss": 0.3813,
"step": 8090
},
{
"epoch": 2.3879716981132075,
"grad_norm": 0.6024196743965149,
"learning_rate": 8.681763652983845e-05,
"loss": 0.3806,
"step": 8100
},
{
"epoch": 2.390919811320755,
"grad_norm": 0.5690436363220215,
"learning_rate": 8.678619553365659e-05,
"loss": 0.3722,
"step": 8110
},
{
"epoch": 2.393867924528302,
"grad_norm": 0.6716231107711792,
"learning_rate": 8.67547227952485e-05,
"loss": 0.4145,
"step": 8120
},
{
"epoch": 2.396816037735849,
"grad_norm": 0.6588793396949768,
"learning_rate": 8.67232183417715e-05,
"loss": 0.4062,
"step": 8130
},
{
"epoch": 2.3997641509433962,
"grad_norm": 0.5847112536430359,
"learning_rate": 8.669168220041031e-05,
"loss": 0.3733,
"step": 8140
},
{
"epoch": 2.4027122641509435,
"grad_norm": 0.6627358794212341,
"learning_rate": 8.666011439837693e-05,
"loss": 0.4039,
"step": 8150
},
{
"epoch": 2.4056603773584904,
"grad_norm": 0.6800193190574646,
"learning_rate": 8.662851496291074e-05,
"loss": 0.4102,
"step": 8160
},
{
"epoch": 2.4086084905660377,
"grad_norm": 0.7622256278991699,
"learning_rate": 8.65968839212784e-05,
"loss": 0.4124,
"step": 8170
},
{
"epoch": 2.411556603773585,
"grad_norm": 0.6789641380310059,
"learning_rate": 8.656522130077382e-05,
"loss": 0.3917,
"step": 8180
},
{
"epoch": 2.4145047169811322,
"grad_norm": 0.5745140910148621,
"learning_rate": 8.653352712871816e-05,
"loss": 0.387,
"step": 8190
},
{
"epoch": 2.417452830188679,
"grad_norm": 0.5641975998878479,
"learning_rate": 8.650180143245986e-05,
"loss": 0.3791,
"step": 8200
},
{
"epoch": 2.4204009433962264,
"grad_norm": 0.4928918778896332,
"learning_rate": 8.647004423937448e-05,
"loss": 0.4077,
"step": 8210
},
{
"epoch": 2.4233490566037736,
"grad_norm": 0.5007837414741516,
"learning_rate": 8.643825557686484e-05,
"loss": 0.4269,
"step": 8220
},
{
"epoch": 2.426297169811321,
"grad_norm": 0.5590235590934753,
"learning_rate": 8.640643547236085e-05,
"loss": 0.3574,
"step": 8230
},
{
"epoch": 2.4292452830188678,
"grad_norm": 0.7226880788803101,
"learning_rate": 8.637458395331956e-05,
"loss": 0.4275,
"step": 8240
},
{
"epoch": 2.432193396226415,
"grad_norm": 0.5810802578926086,
"learning_rate": 8.634270104722518e-05,
"loss": 0.3731,
"step": 8250
},
{
"epoch": 2.4351415094339623,
"grad_norm": 0.6347400546073914,
"learning_rate": 8.631078678158893e-05,
"loss": 0.3876,
"step": 8260
},
{
"epoch": 2.4380896226415096,
"grad_norm": 0.6214938759803772,
"learning_rate": 8.627884118394913e-05,
"loss": 0.4024,
"step": 8270
},
{
"epoch": 2.4410377358490565,
"grad_norm": 0.5793152451515198,
"learning_rate": 8.624686428187117e-05,
"loss": 0.4039,
"step": 8280
},
{
"epoch": 2.4439858490566038,
"grad_norm": 0.6397968530654907,
"learning_rate": 8.621485610294737e-05,
"loss": 0.3953,
"step": 8290
},
{
"epoch": 2.446933962264151,
"grad_norm": 0.9334049820899963,
"learning_rate": 8.618281667479708e-05,
"loss": 0.3952,
"step": 8300
},
{
"epoch": 2.449882075471698,
"grad_norm": 0.5289429426193237,
"learning_rate": 8.615074602506665e-05,
"loss": 0.4053,
"step": 8310
},
{
"epoch": 2.452830188679245,
"grad_norm": 0.44878268241882324,
"learning_rate": 8.611864418142931e-05,
"loss": 0.3999,
"step": 8320
},
{
"epoch": 2.4557783018867925,
"grad_norm": 0.5765179991722107,
"learning_rate": 8.608651117158526e-05,
"loss": 0.4133,
"step": 8330
},
{
"epoch": 2.4587264150943398,
"grad_norm": 0.8636899590492249,
"learning_rate": 8.605434702326153e-05,
"loss": 0.4284,
"step": 8340
},
{
"epoch": 2.4616745283018866,
"grad_norm": 0.7069612145423889,
"learning_rate": 8.602215176421206e-05,
"loss": 0.39,
"step": 8350
},
{
"epoch": 2.464622641509434,
"grad_norm": 0.5607879757881165,
"learning_rate": 8.598992542221766e-05,
"loss": 0.3919,
"step": 8360
},
{
"epoch": 2.467570754716981,
"grad_norm": 0.6326379179954529,
"learning_rate": 8.595766802508591e-05,
"loss": 0.4273,
"step": 8370
},
{
"epoch": 2.4705188679245285,
"grad_norm": 0.5385253429412842,
"learning_rate": 8.59253796006512e-05,
"loss": 0.4008,
"step": 8380
},
{
"epoch": 2.4734669811320753,
"grad_norm": 0.6334646344184875,
"learning_rate": 8.589306017677472e-05,
"loss": 0.4174,
"step": 8390
},
{
"epoch": 2.4764150943396226,
"grad_norm": 0.6317793130874634,
"learning_rate": 8.586070978134437e-05,
"loss": 0.4232,
"step": 8400
},
{
"epoch": 2.47936320754717,
"grad_norm": 0.498748242855072,
"learning_rate": 8.58283284422748e-05,
"loss": 0.418,
"step": 8410
},
{
"epoch": 2.482311320754717,
"grad_norm": 0.6666170954704285,
"learning_rate": 8.579591618750737e-05,
"loss": 0.4058,
"step": 8420
},
{
"epoch": 2.485259433962264,
"grad_norm": 0.662187397480011,
"learning_rate": 8.576347304501009e-05,
"loss": 0.3868,
"step": 8430
},
{
"epoch": 2.4882075471698113,
"grad_norm": 0.8076348900794983,
"learning_rate": 8.573099904277764e-05,
"loss": 0.4102,
"step": 8440
},
{
"epoch": 2.4911556603773586,
"grad_norm": 0.7696743011474609,
"learning_rate": 8.569849420883131e-05,
"loss": 0.3902,
"step": 8450
},
{
"epoch": 2.4941037735849054,
"grad_norm": 0.5895074009895325,
"learning_rate": 8.566595857121902e-05,
"loss": 0.4195,
"step": 8460
},
{
"epoch": 2.4970518867924527,
"grad_norm": 0.4605101943016052,
"learning_rate": 8.563339215801525e-05,
"loss": 0.3924,
"step": 8470
},
{
"epoch": 2.5,
"grad_norm": 0.691888153553009,
"learning_rate": 8.560079499732105e-05,
"loss": 0.4181,
"step": 8480
},
{
"epoch": 2.5029481132075473,
"grad_norm": 0.6193966269493103,
"learning_rate": 8.556816711726399e-05,
"loss": 0.3876,
"step": 8490
},
{
"epoch": 2.5058962264150946,
"grad_norm": 0.6037969589233398,
"learning_rate": 8.553550854599815e-05,
"loss": 0.4085,
"step": 8500
},
{
"epoch": 2.5088443396226414,
"grad_norm": 0.6456816792488098,
"learning_rate": 8.550281931170408e-05,
"loss": 0.3741,
"step": 8510
},
{
"epoch": 2.5117924528301887,
"grad_norm": 0.6365756988525391,
"learning_rate": 8.547009944258884e-05,
"loss": 0.3922,
"step": 8520
},
{
"epoch": 2.514740566037736,
"grad_norm": 0.591224193572998,
"learning_rate": 8.543734896688585e-05,
"loss": 0.3929,
"step": 8530
},
{
"epoch": 2.517688679245283,
"grad_norm": 0.6187371611595154,
"learning_rate": 8.5404567912855e-05,
"loss": 0.3985,
"step": 8540
},
{
"epoch": 2.52063679245283,
"grad_norm": 0.6070706844329834,
"learning_rate": 8.537175630878256e-05,
"loss": 0.4115,
"step": 8550
},
{
"epoch": 2.5235849056603774,
"grad_norm": 0.6325767636299133,
"learning_rate": 8.53389141829811e-05,
"loss": 0.4153,
"step": 8560
},
{
"epoch": 2.5265330188679247,
"grad_norm": 0.6173579096794128,
"learning_rate": 8.530604156378959e-05,
"loss": 0.4096,
"step": 8570
},
{
"epoch": 2.5294811320754715,
"grad_norm": 0.678013265132904,
"learning_rate": 8.52731384795733e-05,
"loss": 0.4123,
"step": 8580
},
{
"epoch": 2.532429245283019,
"grad_norm": 0.6726073026657104,
"learning_rate": 8.524020495872378e-05,
"loss": 0.4203,
"step": 8590
},
{
"epoch": 2.535377358490566,
"grad_norm": 0.6313995122909546,
"learning_rate": 8.520724102965882e-05,
"loss": 0.4235,
"step": 8600
},
{
"epoch": 2.538325471698113,
"grad_norm": 0.757446825504303,
"learning_rate": 8.517424672082253e-05,
"loss": 0.4275,
"step": 8610
},
{
"epoch": 2.5412735849056602,
"grad_norm": 0.6651310920715332,
"learning_rate": 8.514122206068511e-05,
"loss": 0.3916,
"step": 8620
},
{
"epoch": 2.5442216981132075,
"grad_norm": 0.613274097442627,
"learning_rate": 8.510816707774306e-05,
"loss": 0.3893,
"step": 8630
},
{
"epoch": 2.547169811320755,
"grad_norm": 0.6147936582565308,
"learning_rate": 8.507508180051899e-05,
"loss": 0.4031,
"step": 8640
},
{
"epoch": 2.550117924528302,
"grad_norm": 0.6389046907424927,
"learning_rate": 8.504196625756166e-05,
"loss": 0.3662,
"step": 8650
},
{
"epoch": 2.553066037735849,
"grad_norm": 0.6270299553871155,
"learning_rate": 8.500882047744594e-05,
"loss": 0.4019,
"step": 8660
},
{
"epoch": 2.5560141509433962,
"grad_norm": 0.566421389579773,
"learning_rate": 8.497564448877282e-05,
"loss": 0.4005,
"step": 8670
},
{
"epoch": 2.5589622641509435,
"grad_norm": 0.7032985091209412,
"learning_rate": 8.494243832016933e-05,
"loss": 0.4209,
"step": 8680
},
{
"epoch": 2.5619103773584904,
"grad_norm": 0.64497309923172,
"learning_rate": 8.490920200028854e-05,
"loss": 0.3987,
"step": 8690
},
{
"epoch": 2.5648584905660377,
"grad_norm": 0.7354119420051575,
"learning_rate": 8.487593555780953e-05,
"loss": 0.4141,
"step": 8700
},
{
"epoch": 2.567806603773585,
"grad_norm": 1.4042811393737793,
"learning_rate": 8.484263902143741e-05,
"loss": 0.4089,
"step": 8710
},
{
"epoch": 2.5707547169811322,
"grad_norm": 0.6055272817611694,
"learning_rate": 8.480931241990324e-05,
"loss": 0.416,
"step": 8720
},
{
"epoch": 2.5737028301886795,
"grad_norm": 0.6292642951011658,
"learning_rate": 8.4775955781964e-05,
"loss": 0.3717,
"step": 8730
},
{
"epoch": 2.5766509433962264,
"grad_norm": 0.6712636947631836,
"learning_rate": 8.474256913640262e-05,
"loss": 0.396,
"step": 8740
},
{
"epoch": 2.5795990566037736,
"grad_norm": 0.4655943214893341,
"learning_rate": 8.470915251202789e-05,
"loss": 0.4103,
"step": 8750
},
{
"epoch": 2.5825471698113205,
"grad_norm": 0.8315421938896179,
"learning_rate": 8.467570593767449e-05,
"loss": 0.4226,
"step": 8760
},
{
"epoch": 2.5854952830188678,
"grad_norm": 0.7628483176231384,
"learning_rate": 8.464222944220296e-05,
"loss": 0.3949,
"step": 8770
},
{
"epoch": 2.588443396226415,
"grad_norm": 0.6963688135147095,
"learning_rate": 8.460872305449962e-05,
"loss": 0.3974,
"step": 8780
},
{
"epoch": 2.5913915094339623,
"grad_norm": 0.6601455807685852,
"learning_rate": 8.45751868034766e-05,
"loss": 0.4135,
"step": 8790
},
{
"epoch": 2.5943396226415096,
"grad_norm": 0.5189196467399597,
"learning_rate": 8.454162071807181e-05,
"loss": 0.387,
"step": 8800
},
{
"epoch": 2.5972877358490565,
"grad_norm": 0.5418815612792969,
"learning_rate": 8.450802482724888e-05,
"loss": 0.417,
"step": 8810
},
{
"epoch": 2.6002358490566038,
"grad_norm": 0.5572226643562317,
"learning_rate": 8.447439915999721e-05,
"loss": 0.3862,
"step": 8820
},
{
"epoch": 2.603183962264151,
"grad_norm": 0.5427402257919312,
"learning_rate": 8.444074374533181e-05,
"loss": 0.3835,
"step": 8830
},
{
"epoch": 2.606132075471698,
"grad_norm": 0.5943536758422852,
"learning_rate": 8.440705861229344e-05,
"loss": 0.391,
"step": 8840
},
{
"epoch": 2.609080188679245,
"grad_norm": 0.6809098124504089,
"learning_rate": 8.437334378994845e-05,
"loss": 0.3898,
"step": 8850
},
{
"epoch": 2.6120283018867925,
"grad_norm": 0.5982612371444702,
"learning_rate": 8.433959930738881e-05,
"loss": 0.4154,
"step": 8860
},
{
"epoch": 2.6149764150943398,
"grad_norm": 0.664586067199707,
"learning_rate": 8.430582519373215e-05,
"loss": 0.4151,
"step": 8870
},
{
"epoch": 2.617924528301887,
"grad_norm": 0.6811748743057251,
"learning_rate": 8.427202147812159e-05,
"loss": 0.4177,
"step": 8880
},
{
"epoch": 2.620872641509434,
"grad_norm": 0.5431303977966309,
"learning_rate": 8.42381881897258e-05,
"loss": 0.4044,
"step": 8890
},
{
"epoch": 2.623820754716981,
"grad_norm": 0.5670378804206848,
"learning_rate": 8.420432535773901e-05,
"loss": 0.3978,
"step": 8900
},
{
"epoch": 2.6267688679245285,
"grad_norm": 0.4603041708469391,
"learning_rate": 8.417043301138094e-05,
"loss": 0.4099,
"step": 8910
},
{
"epoch": 2.6297169811320753,
"grad_norm": 0.714662492275238,
"learning_rate": 8.413651117989673e-05,
"loss": 0.4,
"step": 8920
},
{
"epoch": 2.6326650943396226,
"grad_norm": 0.4563981592655182,
"learning_rate": 8.4102559892557e-05,
"loss": 0.4113,
"step": 8930
},
{
"epoch": 2.63561320754717,
"grad_norm": 0.9415069818496704,
"learning_rate": 8.40685791786578e-05,
"loss": 0.3769,
"step": 8940
},
{
"epoch": 2.638561320754717,
"grad_norm": 0.6446280479431152,
"learning_rate": 8.403456906752053e-05,
"loss": 0.3752,
"step": 8950
},
{
"epoch": 2.641509433962264,
"grad_norm": 0.744600236415863,
"learning_rate": 8.400052958849197e-05,
"loss": 0.4042,
"step": 8960
},
{
"epoch": 2.6444575471698113,
"grad_norm": 0.557375967502594,
"learning_rate": 8.396646077094424e-05,
"loss": 0.3832,
"step": 8970
},
{
"epoch": 2.6474056603773586,
"grad_norm": 0.630577802658081,
"learning_rate": 8.393236264427482e-05,
"loss": 0.4047,
"step": 8980
},
{
"epoch": 2.6503537735849054,
"grad_norm": 0.5203380584716797,
"learning_rate": 8.389823523790643e-05,
"loss": 0.4014,
"step": 8990
},
{
"epoch": 2.6533018867924527,
"grad_norm": 0.5568553805351257,
"learning_rate": 8.386407858128706e-05,
"loss": 0.3972,
"step": 9000
},
{
"epoch": 2.6533018867924527,
"eval_runtime": 2157.0263,
"eval_samples_per_second": 4.194,
"eval_steps_per_second": 0.524,
"step": 9000
},
{
"epoch": 2.65625,
"grad_norm": 0.5672769546508789,
"learning_rate": 8.382989270388996e-05,
"loss": 0.3659,
"step": 9010
},
{
"epoch": 2.6591981132075473,
"grad_norm": 0.6042519211769104,
"learning_rate": 8.379567763521356e-05,
"loss": 0.3884,
"step": 9020
},
{
"epoch": 2.6621462264150946,
"grad_norm": 0.5790214538574219,
"learning_rate": 8.376143340478153e-05,
"loss": 0.3886,
"step": 9030
},
{
"epoch": 2.6650943396226414,
"grad_norm": 0.5996211767196655,
"learning_rate": 8.372716004214266e-05,
"loss": 0.378,
"step": 9040
},
{
"epoch": 2.6680424528301887,
"grad_norm": 0.6784074306488037,
"learning_rate": 8.36928575768709e-05,
"loss": 0.3892,
"step": 9050
},
{
"epoch": 2.670990566037736,
"grad_norm": 0.6817885041236877,
"learning_rate": 8.365852603856529e-05,
"loss": 0.408,
"step": 9060
},
{
"epoch": 2.673938679245283,
"grad_norm": 0.7811933755874634,
"learning_rate": 8.362416545684999e-05,
"loss": 0.4157,
"step": 9070
},
{
"epoch": 2.67688679245283,
"grad_norm": 0.762050211429596,
"learning_rate": 8.358977586137419e-05,
"loss": 0.3944,
"step": 9080
},
{
"epoch": 2.6798349056603774,
"grad_norm": 0.58027184009552,
"learning_rate": 8.355535728181212e-05,
"loss": 0.4057,
"step": 9090
},
{
"epoch": 2.6827830188679247,
"grad_norm": 0.6728706359863281,
"learning_rate": 8.352090974786305e-05,
"loss": 0.4037,
"step": 9100
},
{
"epoch": 2.6857311320754715,
"grad_norm": 0.5518704056739807,
"learning_rate": 8.34864332892512e-05,
"loss": 0.4258,
"step": 9110
},
{
"epoch": 2.688679245283019,
"grad_norm": 0.6891208291053772,
"learning_rate": 8.345192793572577e-05,
"loss": 0.4144,
"step": 9120
},
{
"epoch": 2.691627358490566,
"grad_norm": 0.7742394208908081,
"learning_rate": 8.341739371706087e-05,
"loss": 0.4065,
"step": 9130
},
{
"epoch": 2.694575471698113,
"grad_norm": 0.5380538105964661,
"learning_rate": 8.338283066305555e-05,
"loss": 0.4236,
"step": 9140
},
{
"epoch": 2.6975235849056602,
"grad_norm": 0.6107019782066345,
"learning_rate": 8.334823880353369e-05,
"loss": 0.399,
"step": 9150
},
{
"epoch": 2.7004716981132075,
"grad_norm": 0.6730389595031738,
"learning_rate": 8.331361816834408e-05,
"loss": 0.4282,
"step": 9160
},
{
"epoch": 2.703419811320755,
"grad_norm": 0.6174814701080322,
"learning_rate": 8.327896878736032e-05,
"loss": 0.4033,
"step": 9170
},
{
"epoch": 2.706367924528302,
"grad_norm": 0.6173862218856812,
"learning_rate": 8.32442906904808e-05,
"loss": 0.3776,
"step": 9180
},
{
"epoch": 2.709316037735849,
"grad_norm": 0.5742660760879517,
"learning_rate": 8.320958390762873e-05,
"loss": 0.3661,
"step": 9190
},
{
"epoch": 2.7122641509433962,
"grad_norm": 0.6058078408241272,
"learning_rate": 8.3174848468752e-05,
"loss": 0.3976,
"step": 9200
},
{
"epoch": 2.7152122641509435,
"grad_norm": 0.7256009578704834,
"learning_rate": 8.314008440382332e-05,
"loss": 0.3897,
"step": 9210
},
{
"epoch": 2.7181603773584904,
"grad_norm": 0.6028457880020142,
"learning_rate": 8.310529174284004e-05,
"loss": 0.4063,
"step": 9220
},
{
"epoch": 2.7211084905660377,
"grad_norm": 0.9602671265602112,
"learning_rate": 8.307047051582415e-05,
"loss": 0.3776,
"step": 9230
},
{
"epoch": 2.724056603773585,
"grad_norm": 0.6980867981910706,
"learning_rate": 8.303562075282239e-05,
"loss": 0.4265,
"step": 9240
},
{
"epoch": 2.7270047169811322,
"grad_norm": 0.7040355801582336,
"learning_rate": 8.300074248390603e-05,
"loss": 0.4002,
"step": 9250
},
{
"epoch": 2.7299528301886795,
"grad_norm": 0.6856332421302795,
"learning_rate": 8.2965835739171e-05,
"loss": 0.3941,
"step": 9260
},
{
"epoch": 2.7329009433962264,
"grad_norm": 0.5288712978363037,
"learning_rate": 8.293090054873777e-05,
"loss": 0.3988,
"step": 9270
},
{
"epoch": 2.7358490566037736,
"grad_norm": 0.6329268217086792,
"learning_rate": 8.289593694275138e-05,
"loss": 0.3914,
"step": 9280
},
{
"epoch": 2.7387971698113205,
"grad_norm": 0.6526012420654297,
"learning_rate": 8.286094495138136e-05,
"loss": 0.4155,
"step": 9290
},
{
"epoch": 2.7417452830188678,
"grad_norm": 0.7313765287399292,
"learning_rate": 8.282592460482174e-05,
"loss": 0.4254,
"step": 9300
},
{
"epoch": 2.744693396226415,
"grad_norm": 0.6954824924468994,
"learning_rate": 8.279087593329103e-05,
"loss": 0.3799,
"step": 9310
},
{
"epoch": 2.7476415094339623,
"grad_norm": 0.640709638595581,
"learning_rate": 8.275579896703216e-05,
"loss": 0.4147,
"step": 9320
},
{
"epoch": 2.7505896226415096,
"grad_norm": 0.5725107789039612,
"learning_rate": 8.27206937363125e-05,
"loss": 0.4085,
"step": 9330
},
{
"epoch": 2.7535377358490565,
"grad_norm": 0.6550928354263306,
"learning_rate": 8.268556027142382e-05,
"loss": 0.4063,
"step": 9340
},
{
"epoch": 2.7564858490566038,
"grad_norm": 0.6830494403839111,
"learning_rate": 8.26503986026822e-05,
"loss": 0.4158,
"step": 9350
},
{
"epoch": 2.759433962264151,
"grad_norm": 0.6744377613067627,
"learning_rate": 8.26152087604281e-05,
"loss": 0.3872,
"step": 9360
},
{
"epoch": 2.762382075471698,
"grad_norm": 0.6990178227424622,
"learning_rate": 8.257999077502627e-05,
"loss": 0.3824,
"step": 9370
},
{
"epoch": 2.765330188679245,
"grad_norm": 0.5335264205932617,
"learning_rate": 8.254474467686577e-05,
"loss": 0.3918,
"step": 9380
},
{
"epoch": 2.7682783018867925,
"grad_norm": 0.6216477751731873,
"learning_rate": 8.250947049635988e-05,
"loss": 0.4068,
"step": 9390
},
{
"epoch": 2.7712264150943398,
"grad_norm": 0.7340528964996338,
"learning_rate": 8.247416826394615e-05,
"loss": 0.3614,
"step": 9400
},
{
"epoch": 2.774174528301887,
"grad_norm": 0.966379702091217,
"learning_rate": 8.243883801008632e-05,
"loss": 0.4138,
"step": 9410
},
{
"epoch": 2.777122641509434,
"grad_norm": 0.7760789394378662,
"learning_rate": 8.240347976526628e-05,
"loss": 0.4127,
"step": 9420
},
{
"epoch": 2.780070754716981,
"grad_norm": 0.673620343208313,
"learning_rate": 8.236809355999612e-05,
"loss": 0.403,
"step": 9430
},
{
"epoch": 2.7830188679245285,
"grad_norm": 0.6146959066390991,
"learning_rate": 8.233267942481004e-05,
"loss": 0.3982,
"step": 9440
},
{
"epoch": 2.7859669811320753,
"grad_norm": 0.5682289004325867,
"learning_rate": 8.229723739026634e-05,
"loss": 0.3973,
"step": 9450
},
{
"epoch": 2.7889150943396226,
"grad_norm": 0.6069419384002686,
"learning_rate": 8.226176748694736e-05,
"loss": 0.3934,
"step": 9460
},
{
"epoch": 2.79186320754717,
"grad_norm": 0.6468014121055603,
"learning_rate": 8.222626974545955e-05,
"loss": 0.41,
"step": 9470
},
{
"epoch": 2.794811320754717,
"grad_norm": 0.5781310200691223,
"learning_rate": 8.219074419643334e-05,
"loss": 0.407,
"step": 9480
},
{
"epoch": 2.797759433962264,
"grad_norm": 0.4977602958679199,
"learning_rate": 8.215519087052316e-05,
"loss": 0.378,
"step": 9490
},
{
"epoch": 2.8007075471698113,
"grad_norm": 0.5155404210090637,
"learning_rate": 8.211960979840744e-05,
"loss": 0.3767,
"step": 9500
},
{
"epoch": 2.8036556603773586,
"grad_norm": 0.6690905690193176,
"learning_rate": 8.208400101078848e-05,
"loss": 0.3854,
"step": 9510
},
{
"epoch": 2.8066037735849054,
"grad_norm": 0.6241164207458496,
"learning_rate": 8.204836453839258e-05,
"loss": 0.3928,
"step": 9520
},
{
"epoch": 2.8095518867924527,
"grad_norm": 0.5780124068260193,
"learning_rate": 8.201270041196985e-05,
"loss": 0.4047,
"step": 9530
},
{
"epoch": 2.8125,
"grad_norm": 0.5171002745628357,
"learning_rate": 8.197700866229433e-05,
"loss": 0.3833,
"step": 9540
},
{
"epoch": 2.8154481132075473,
"grad_norm": 0.5561649799346924,
"learning_rate": 8.194128932016385e-05,
"loss": 0.3834,
"step": 9550
},
{
"epoch": 2.8183962264150946,
"grad_norm": 0.5444555282592773,
"learning_rate": 8.190554241640008e-05,
"loss": 0.4092,
"step": 9560
},
{
"epoch": 2.8213443396226414,
"grad_norm": 0.6935590505599976,
"learning_rate": 8.186976798184844e-05,
"loss": 0.39,
"step": 9570
},
{
"epoch": 2.8242924528301887,
"grad_norm": 0.6038203835487366,
"learning_rate": 8.183396604737815e-05,
"loss": 0.3667,
"step": 9580
},
{
"epoch": 2.827240566037736,
"grad_norm": 0.5164621472358704,
"learning_rate": 8.17981366438821e-05,
"loss": 0.3961,
"step": 9590
},
{
"epoch": 2.830188679245283,
"grad_norm": 0.5800121426582336,
"learning_rate": 8.176227980227694e-05,
"loss": 0.4047,
"step": 9600
},
{
"epoch": 2.83313679245283,
"grad_norm": 0.6471472382545471,
"learning_rate": 8.172639555350294e-05,
"loss": 0.3948,
"step": 9610
},
{
"epoch": 2.8360849056603774,
"grad_norm": 0.5783464312553406,
"learning_rate": 8.16904839285241e-05,
"loss": 0.4158,
"step": 9620
},
{
"epoch": 2.8390330188679247,
"grad_norm": 0.6344701647758484,
"learning_rate": 8.165454495832796e-05,
"loss": 0.3942,
"step": 9630
},
{
"epoch": 2.8419811320754715,
"grad_norm": 0.7728832364082336,
"learning_rate": 8.16185786739257e-05,
"loss": 0.3738,
"step": 9640
},
{
"epoch": 2.844929245283019,
"grad_norm": 0.6444641947746277,
"learning_rate": 8.158258510635205e-05,
"loss": 0.3791,
"step": 9650
},
{
"epoch": 2.847877358490566,
"grad_norm": 0.6528775095939636,
"learning_rate": 8.15465642866653e-05,
"loss": 0.3904,
"step": 9660
},
{
"epoch": 2.850825471698113,
"grad_norm": 0.6341770887374878,
"learning_rate": 8.151051624594723e-05,
"loss": 0.3991,
"step": 9670
},
{
"epoch": 2.8537735849056602,
"grad_norm": 0.5475857257843018,
"learning_rate": 8.147444101530313e-05,
"loss": 0.3934,
"step": 9680
},
{
"epoch": 2.8567216981132075,
"grad_norm": 0.5738356113433838,
"learning_rate": 8.143833862586177e-05,
"loss": 0.4014,
"step": 9690
},
{
"epoch": 2.859669811320755,
"grad_norm": 0.5165804624557495,
"learning_rate": 8.140220910877529e-05,
"loss": 0.4069,
"step": 9700
},
{
"epoch": 2.862617924528302,
"grad_norm": 0.5567706823348999,
"learning_rate": 8.136605249521929e-05,
"loss": 0.4064,
"step": 9710
},
{
"epoch": 2.865566037735849,
"grad_norm": 0.6238338947296143,
"learning_rate": 8.132986881639278e-05,
"loss": 0.3914,
"step": 9720
},
{
"epoch": 2.8685141509433962,
"grad_norm": 0.6118544340133667,
"learning_rate": 8.129365810351802e-05,
"loss": 0.3637,
"step": 9730
},
{
"epoch": 2.8714622641509435,
"grad_norm": 0.5087597966194153,
"learning_rate": 8.125742038784072e-05,
"loss": 0.3674,
"step": 9740
},
{
"epoch": 2.8744103773584904,
"grad_norm": 0.6550133228302002,
"learning_rate": 8.122115570062978e-05,
"loss": 0.3682,
"step": 9750
},
{
"epoch": 2.8773584905660377,
"grad_norm": 0.5862494707107544,
"learning_rate": 8.118486407317747e-05,
"loss": 0.3821,
"step": 9760
},
{
"epoch": 2.880306603773585,
"grad_norm": 0.5644306540489197,
"learning_rate": 8.114854553679925e-05,
"loss": 0.3916,
"step": 9770
},
{
"epoch": 2.8832547169811322,
"grad_norm": 0.51007080078125,
"learning_rate": 8.111220012283378e-05,
"loss": 0.396,
"step": 9780
},
{
"epoch": 2.8862028301886795,
"grad_norm": 0.5808833241462708,
"learning_rate": 8.107582786264299e-05,
"loss": 0.3822,
"step": 9790
},
{
"epoch": 2.8891509433962264,
"grad_norm": 0.5200833678245544,
"learning_rate": 8.103942878761188e-05,
"loss": 0.4109,
"step": 9800
},
{
"epoch": 2.8920990566037736,
"grad_norm": 0.7647379636764526,
"learning_rate": 8.100300292914866e-05,
"loss": 0.396,
"step": 9810
},
{
"epoch": 2.8950471698113205,
"grad_norm": 0.6947141885757446,
"learning_rate": 8.096655031868464e-05,
"loss": 0.3933,
"step": 9820
},
{
"epoch": 2.8979952830188678,
"grad_norm": 0.8077340126037598,
"learning_rate": 8.093007098767418e-05,
"loss": 0.4129,
"step": 9830
},
{
"epoch": 2.900943396226415,
"grad_norm": 0.55204176902771,
"learning_rate": 8.089356496759472e-05,
"loss": 0.4278,
"step": 9840
},
{
"epoch": 2.9038915094339623,
"grad_norm": 0.7977062463760376,
"learning_rate": 8.085703228994675e-05,
"loss": 0.3946,
"step": 9850
},
{
"epoch": 2.9068396226415096,
"grad_norm": 0.6182461977005005,
"learning_rate": 8.082047298625371e-05,
"loss": 0.4109,
"step": 9860
},
{
"epoch": 2.9097877358490565,
"grad_norm": 0.5555739998817444,
"learning_rate": 8.078388708806204e-05,
"loss": 0.4163,
"step": 9870
},
{
"epoch": 2.9127358490566038,
"grad_norm": 0.5553911924362183,
"learning_rate": 8.074727462694117e-05,
"loss": 0.3657,
"step": 9880
},
{
"epoch": 2.915683962264151,
"grad_norm": 0.5570409893989563,
"learning_rate": 8.07106356344834e-05,
"loss": 0.408,
"step": 9890
},
{
"epoch": 2.918632075471698,
"grad_norm": 0.5666655898094177,
"learning_rate": 8.067397014230391e-05,
"loss": 0.4121,
"step": 9900
},
{
"epoch": 2.921580188679245,
"grad_norm": 0.5638427138328552,
"learning_rate": 8.06372781820408e-05,
"loss": 0.3968,
"step": 9910
},
{
"epoch": 2.9245283018867925,
"grad_norm": 0.5179740786552429,
"learning_rate": 8.060055978535499e-05,
"loss": 0.403,
"step": 9920
},
{
"epoch": 2.9274764150943398,
"grad_norm": 0.5709393620491028,
"learning_rate": 8.056381498393018e-05,
"loss": 0.4088,
"step": 9930
},
{
"epoch": 2.930424528301887,
"grad_norm": 0.6752002239227295,
"learning_rate": 8.052704380947289e-05,
"loss": 0.3844,
"step": 9940
},
{
"epoch": 2.933372641509434,
"grad_norm": 0.6012061834335327,
"learning_rate": 8.049024629371238e-05,
"loss": 0.4015,
"step": 9950
},
{
"epoch": 2.936320754716981,
"grad_norm": 0.6851065754890442,
"learning_rate": 8.045342246840065e-05,
"loss": 0.3987,
"step": 9960
},
{
"epoch": 2.9392688679245285,
"grad_norm": 0.6459661722183228,
"learning_rate": 8.041657236531237e-05,
"loss": 0.3915,
"step": 9970
},
{
"epoch": 2.9422169811320753,
"grad_norm": 0.5662359595298767,
"learning_rate": 8.037969601624495e-05,
"loss": 0.3805,
"step": 9980
},
{
"epoch": 2.9451650943396226,
"grad_norm": 0.5004839301109314,
"learning_rate": 8.03427934530184e-05,
"loss": 0.3981,
"step": 9990
},
{
"epoch": 2.94811320754717,
"grad_norm": 0.5871425271034241,
"learning_rate": 8.030586470747534e-05,
"loss": 0.4006,
"step": 10000
},
{
"epoch": 2.94811320754717,
"eval_runtime": 2142.2792,
"eval_samples_per_second": 4.223,
"eval_steps_per_second": 0.528,
"step": 10000
},
{
"epoch": 2.951061320754717,
"grad_norm": 0.5965262055397034,
"learning_rate": 8.026890981148101e-05,
"loss": 0.3893,
"step": 10010
},
{
"epoch": 2.954009433962264,
"grad_norm": 0.529233455657959,
"learning_rate": 8.02319287969232e-05,
"loss": 0.3882,
"step": 10020
},
{
"epoch": 2.9569575471698113,
"grad_norm": 0.5882250070571899,
"learning_rate": 8.019492169571226e-05,
"loss": 0.3768,
"step": 10030
},
{
"epoch": 2.9599056603773586,
"grad_norm": 0.6170284748077393,
"learning_rate": 8.015788853978103e-05,
"loss": 0.4015,
"step": 10040
},
{
"epoch": 2.9628537735849054,
"grad_norm": 0.7063453793525696,
"learning_rate": 8.01208293610848e-05,
"loss": 0.3987,
"step": 10050
},
{
"epoch": 2.9658018867924527,
"grad_norm": 0.705338716506958,
"learning_rate": 8.008374419160138e-05,
"loss": 0.4152,
"step": 10060
},
{
"epoch": 2.96875,
"grad_norm": 0.9048996567726135,
"learning_rate": 8.004663306333098e-05,
"loss": 0.3833,
"step": 10070
},
{
"epoch": 2.9716981132075473,
"grad_norm": 0.732462465763092,
"learning_rate": 8.000949600829619e-05,
"loss": 0.394,
"step": 10080
},
{
"epoch": 2.9746462264150946,
"grad_norm": 0.7558377385139465,
"learning_rate": 7.9972333058542e-05,
"loss": 0.4012,
"step": 10090
},
{
"epoch": 2.9775943396226414,
"grad_norm": 0.5889543294906616,
"learning_rate": 7.99351442461357e-05,
"loss": 0.3817,
"step": 10100
},
{
"epoch": 2.9805424528301887,
"grad_norm": 0.7234789133071899,
"learning_rate": 7.989792960316697e-05,
"loss": 0.3752,
"step": 10110
},
{
"epoch": 2.983490566037736,
"grad_norm": 0.774864673614502,
"learning_rate": 7.986068916174774e-05,
"loss": 0.4001,
"step": 10120
},
{
"epoch": 2.986438679245283,
"grad_norm": 0.6571406722068787,
"learning_rate": 7.982342295401214e-05,
"loss": 0.3862,
"step": 10130
},
{
"epoch": 2.98938679245283,
"grad_norm": 0.616178572177887,
"learning_rate": 7.978613101211665e-05,
"loss": 0.373,
"step": 10140
},
{
"epoch": 2.9923349056603774,
"grad_norm": 0.6046925783157349,
"learning_rate": 7.974881336823988e-05,
"loss": 0.3966,
"step": 10150
},
{
"epoch": 2.9952830188679247,
"grad_norm": 0.607414186000824,
"learning_rate": 7.971147005458262e-05,
"loss": 0.4033,
"step": 10160
},
{
"epoch": 2.9982311320754715,
"grad_norm": 0.7482629418373108,
"learning_rate": 7.967410110336782e-05,
"loss": 0.4193,
"step": 10170
},
{
"epoch": 3.001179245283019,
"grad_norm": 0.5863407850265503,
"learning_rate": 7.963670654684059e-05,
"loss": 0.3624,
"step": 10180
},
{
"epoch": 3.004127358490566,
"grad_norm": 0.9249994158744812,
"learning_rate": 7.959928641726807e-05,
"loss": 0.3977,
"step": 10190
},
{
"epoch": 3.0070754716981134,
"grad_norm": 0.6155794262886047,
"learning_rate": 7.956184074693951e-05,
"loss": 0.4053,
"step": 10200
},
{
"epoch": 3.0100235849056602,
"grad_norm": 0.6526456475257874,
"learning_rate": 7.95243695681662e-05,
"loss": 0.3883,
"step": 10210
},
{
"epoch": 3.0129716981132075,
"grad_norm": 0.7357699275016785,
"learning_rate": 7.94868729132814e-05,
"loss": 0.4016,
"step": 10220
},
{
"epoch": 3.015919811320755,
"grad_norm": 0.5800666213035583,
"learning_rate": 7.94493508146404e-05,
"loss": 0.3967,
"step": 10230
},
{
"epoch": 3.018867924528302,
"grad_norm": 0.6288039088249207,
"learning_rate": 7.941180330462043e-05,
"loss": 0.3834,
"step": 10240
},
{
"epoch": 3.021816037735849,
"grad_norm": 0.5682847499847412,
"learning_rate": 7.937423041562061e-05,
"loss": 0.3964,
"step": 10250
},
{
"epoch": 3.0247641509433962,
"grad_norm": 0.48693859577178955,
"learning_rate": 7.933663218006204e-05,
"loss": 0.375,
"step": 10260
},
{
"epoch": 3.0277122641509435,
"grad_norm": 0.6615593433380127,
"learning_rate": 7.92990086303876e-05,
"loss": 0.3816,
"step": 10270
},
{
"epoch": 3.0306603773584904,
"grad_norm": 0.6491215229034424,
"learning_rate": 7.926135979906207e-05,
"loss": 0.4187,
"step": 10280
},
{
"epoch": 3.0336084905660377,
"grad_norm": 0.47101473808288574,
"learning_rate": 7.922368571857205e-05,
"loss": 0.3903,
"step": 10290
},
{
"epoch": 3.036556603773585,
"grad_norm": 0.624442994594574,
"learning_rate": 7.918598642142587e-05,
"loss": 0.4134,
"step": 10300
},
{
"epoch": 3.0395047169811322,
"grad_norm": 1.0118088722229004,
"learning_rate": 7.91482619401537e-05,
"loss": 0.4032,
"step": 10310
},
{
"epoch": 3.042452830188679,
"grad_norm": 0.7681689858436584,
"learning_rate": 7.911051230730737e-05,
"loss": 0.3643,
"step": 10320
},
{
"epoch": 3.0454009433962264,
"grad_norm": 0.49244460463523865,
"learning_rate": 7.907273755546044e-05,
"loss": 0.3954,
"step": 10330
},
{
"epoch": 3.0483490566037736,
"grad_norm": 0.6396487355232239,
"learning_rate": 7.903493771720815e-05,
"loss": 0.3953,
"step": 10340
},
{
"epoch": 3.051297169811321,
"grad_norm": 0.5380109548568726,
"learning_rate": 7.89971128251674e-05,
"loss": 0.3755,
"step": 10350
},
{
"epoch": 3.0542452830188678,
"grad_norm": 0.6882004737854004,
"learning_rate": 7.895926291197667e-05,
"loss": 0.372,
"step": 10360
},
{
"epoch": 3.057193396226415,
"grad_norm": 0.9137771725654602,
"learning_rate": 7.892138801029607e-05,
"loss": 0.4113,
"step": 10370
},
{
"epoch": 3.0601415094339623,
"grad_norm": 0.548916220664978,
"learning_rate": 7.888348815280724e-05,
"loss": 0.3958,
"step": 10380
},
{
"epoch": 3.0630896226415096,
"grad_norm": 0.5533170104026794,
"learning_rate": 7.884556337221336e-05,
"loss": 0.3796,
"step": 10390
},
{
"epoch": 3.0660377358490565,
"grad_norm": 0.7158203721046448,
"learning_rate": 7.880761370123913e-05,
"loss": 0.4026,
"step": 10400
},
{
"epoch": 3.0689858490566038,
"grad_norm": 0.7281920313835144,
"learning_rate": 7.876963917263073e-05,
"loss": 0.3938,
"step": 10410
},
{
"epoch": 3.071933962264151,
"grad_norm": 0.5935388803482056,
"learning_rate": 7.873163981915579e-05,
"loss": 0.3768,
"step": 10420
},
{
"epoch": 3.074882075471698,
"grad_norm": 0.7776430249214172,
"learning_rate": 7.869361567360331e-05,
"loss": 0.402,
"step": 10430
},
{
"epoch": 3.077830188679245,
"grad_norm": 0.5724771022796631,
"learning_rate": 7.865556676878376e-05,
"loss": 0.3878,
"step": 10440
},
{
"epoch": 3.0807783018867925,
"grad_norm": 0.65288907289505,
"learning_rate": 7.861749313752894e-05,
"loss": 0.3928,
"step": 10450
},
{
"epoch": 3.0837264150943398,
"grad_norm": 0.5804994106292725,
"learning_rate": 7.857939481269195e-05,
"loss": 0.3955,
"step": 10460
},
{
"epoch": 3.0866745283018866,
"grad_norm": 0.8151591420173645,
"learning_rate": 7.854127182714725e-05,
"loss": 0.4112,
"step": 10470
},
{
"epoch": 3.089622641509434,
"grad_norm": 0.6092017292976379,
"learning_rate": 7.850312421379058e-05,
"loss": 0.3899,
"step": 10480
},
{
"epoch": 3.092570754716981,
"grad_norm": 0.6407593488693237,
"learning_rate": 7.846495200553888e-05,
"loss": 0.3983,
"step": 10490
},
{
"epoch": 3.0955188679245285,
"grad_norm": 0.5366164445877075,
"learning_rate": 7.842675523533037e-05,
"loss": 0.3812,
"step": 10500
},
{
"epoch": 3.0984669811320753,
"grad_norm": 0.6861183643341064,
"learning_rate": 7.838853393612444e-05,
"loss": 0.3978,
"step": 10510
},
{
"epoch": 3.1014150943396226,
"grad_norm": 0.7737077474594116,
"learning_rate": 7.835028814090162e-05,
"loss": 0.3844,
"step": 10520
},
{
"epoch": 3.10436320754717,
"grad_norm": 0.5593844056129456,
"learning_rate": 7.831201788266363e-05,
"loss": 0.3689,
"step": 10530
},
{
"epoch": 3.107311320754717,
"grad_norm": 0.6012823581695557,
"learning_rate": 7.827372319443324e-05,
"loss": 0.3917,
"step": 10540
},
{
"epoch": 3.110259433962264,
"grad_norm": 0.791448712348938,
"learning_rate": 7.823540410925435e-05,
"loss": 0.3801,
"step": 10550
},
{
"epoch": 3.1132075471698113,
"grad_norm": 0.537286102771759,
"learning_rate": 7.819706066019189e-05,
"loss": 0.379,
"step": 10560
},
{
"epoch": 3.1161556603773586,
"grad_norm": 0.7742512822151184,
"learning_rate": 7.815869288033182e-05,
"loss": 0.3773,
"step": 10570
},
{
"epoch": 3.119103773584906,
"grad_norm": 0.5583837032318115,
"learning_rate": 7.812030080278107e-05,
"loss": 0.3965,
"step": 10580
},
{
"epoch": 3.1220518867924527,
"grad_norm": 0.6526674032211304,
"learning_rate": 7.808188446066759e-05,
"loss": 0.3992,
"step": 10590
},
{
"epoch": 3.125,
"grad_norm": 0.4943353831768036,
"learning_rate": 7.80434438871402e-05,
"loss": 0.4046,
"step": 10600
},
{
"epoch": 3.1279481132075473,
"grad_norm": 0.5462976694107056,
"learning_rate": 7.80049791153687e-05,
"loss": 0.4042,
"step": 10610
},
{
"epoch": 3.1308962264150946,
"grad_norm": 0.6557654738426208,
"learning_rate": 7.796649017854369e-05,
"loss": 0.4128,
"step": 10620
},
{
"epoch": 3.1338443396226414,
"grad_norm": 0.7243868112564087,
"learning_rate": 7.792797710987672e-05,
"loss": 0.4002,
"step": 10630
},
{
"epoch": 3.1367924528301887,
"grad_norm": 0.5722725987434387,
"learning_rate": 7.788943994260004e-05,
"loss": 0.3618,
"step": 10640
},
{
"epoch": 3.139740566037736,
"grad_norm": 0.7565357685089111,
"learning_rate": 7.785087870996682e-05,
"loss": 0.4058,
"step": 10650
},
{
"epoch": 3.142688679245283,
"grad_norm": 0.5960872769355774,
"learning_rate": 7.781229344525089e-05,
"loss": 0.382,
"step": 10660
},
{
"epoch": 3.14563679245283,
"grad_norm": 0.5391949415206909,
"learning_rate": 7.77736841817469e-05,
"loss": 0.3701,
"step": 10670
},
{
"epoch": 3.1485849056603774,
"grad_norm": 0.5930646657943726,
"learning_rate": 7.773505095277016e-05,
"loss": 0.408,
"step": 10680
},
{
"epoch": 3.1515330188679247,
"grad_norm": 0.565026044845581,
"learning_rate": 7.769639379165667e-05,
"loss": 0.3986,
"step": 10690
},
{
"epoch": 3.1544811320754715,
"grad_norm": 0.5504519939422607,
"learning_rate": 7.76577127317631e-05,
"loss": 0.3873,
"step": 10700
},
{
"epoch": 3.157429245283019,
"grad_norm": 0.7875372171401978,
"learning_rate": 7.761900780646671e-05,
"loss": 0.4126,
"step": 10710
},
{
"epoch": 3.160377358490566,
"grad_norm": 0.6552594304084778,
"learning_rate": 7.758027904916537e-05,
"loss": 0.3909,
"step": 10720
},
{
"epoch": 3.1633254716981134,
"grad_norm": 0.6908440589904785,
"learning_rate": 7.754152649327748e-05,
"loss": 0.4028,
"step": 10730
},
{
"epoch": 3.1662735849056602,
"grad_norm": 0.7323873043060303,
"learning_rate": 7.750275017224207e-05,
"loss": 0.3911,
"step": 10740
},
{
"epoch": 3.1692216981132075,
"grad_norm": 0.6748160123825073,
"learning_rate": 7.746395011951857e-05,
"loss": 0.3907,
"step": 10750
},
{
"epoch": 3.172169811320755,
"grad_norm": 0.7665725350379944,
"learning_rate": 7.742512636858694e-05,
"loss": 0.4011,
"step": 10760
},
{
"epoch": 3.175117924528302,
"grad_norm": 0.7948001623153687,
"learning_rate": 7.738627895294761e-05,
"loss": 0.4092,
"step": 10770
},
{
"epoch": 3.178066037735849,
"grad_norm": 0.7065974473953247,
"learning_rate": 7.734740790612136e-05,
"loss": 0.3977,
"step": 10780
},
{
"epoch": 3.1810141509433962,
"grad_norm": 1.30756676197052,
"learning_rate": 7.730851326164941e-05,
"loss": 0.4076,
"step": 10790
},
{
"epoch": 3.1839622641509435,
"grad_norm": 0.6469963788986206,
"learning_rate": 7.726959505309334e-05,
"loss": 0.3677,
"step": 10800
},
{
"epoch": 3.1869103773584904,
"grad_norm": 0.7830958962440491,
"learning_rate": 7.723065331403506e-05,
"loss": 0.403,
"step": 10810
},
{
"epoch": 3.1898584905660377,
"grad_norm": 0.5901809930801392,
"learning_rate": 7.719168807807678e-05,
"loss": 0.3844,
"step": 10820
},
{
"epoch": 3.192806603773585,
"grad_norm": 0.5996023416519165,
"learning_rate": 7.715269937884097e-05,
"loss": 0.4295,
"step": 10830
},
{
"epoch": 3.1957547169811322,
"grad_norm": 0.625641942024231,
"learning_rate": 7.711368724997038e-05,
"loss": 0.4013,
"step": 10840
},
{
"epoch": 3.198702830188679,
"grad_norm": 0.5808256268501282,
"learning_rate": 7.707465172512797e-05,
"loss": 0.3835,
"step": 10850
},
{
"epoch": 3.2016509433962264,
"grad_norm": 0.558075487613678,
"learning_rate": 7.703559283799684e-05,
"loss": 0.3896,
"step": 10860
},
{
"epoch": 3.2045990566037736,
"grad_norm": 0.6329627633094788,
"learning_rate": 7.699651062228033e-05,
"loss": 0.4203,
"step": 10870
},
{
"epoch": 3.207547169811321,
"grad_norm": 0.6433579325675964,
"learning_rate": 7.695740511170182e-05,
"loss": 0.3926,
"step": 10880
},
{
"epoch": 3.2104952830188678,
"grad_norm": 0.5347244143486023,
"learning_rate": 7.691827634000487e-05,
"loss": 0.403,
"step": 10890
},
{
"epoch": 3.213443396226415,
"grad_norm": 0.5241761803627014,
"learning_rate": 7.687912434095305e-05,
"loss": 0.3743,
"step": 10900
},
{
"epoch": 3.2163915094339623,
"grad_norm": 0.6813592314720154,
"learning_rate": 7.683994914833004e-05,
"loss": 0.3869,
"step": 10910
},
{
"epoch": 3.2193396226415096,
"grad_norm": 0.6979883313179016,
"learning_rate": 7.680075079593947e-05,
"loss": 0.414,
"step": 10920
},
{
"epoch": 3.2222877358490565,
"grad_norm": 0.6808333992958069,
"learning_rate": 7.676152931760496e-05,
"loss": 0.3869,
"step": 10930
},
{
"epoch": 3.2252358490566038,
"grad_norm": 0.6486300230026245,
"learning_rate": 7.672228474717015e-05,
"loss": 0.3919,
"step": 10940
},
{
"epoch": 3.228183962264151,
"grad_norm": 0.6181318759918213,
"learning_rate": 7.668301711849851e-05,
"loss": 0.4227,
"step": 10950
},
{
"epoch": 3.231132075471698,
"grad_norm": 0.620484471321106,
"learning_rate": 7.664372646547349e-05,
"loss": 0.3853,
"step": 10960
},
{
"epoch": 3.234080188679245,
"grad_norm": 0.7112252712249756,
"learning_rate": 7.660441282199835e-05,
"loss": 0.4017,
"step": 10970
},
{
"epoch": 3.2370283018867925,
"grad_norm": 0.565260112285614,
"learning_rate": 7.656507622199623e-05,
"loss": 0.3923,
"step": 10980
},
{
"epoch": 3.2399764150943398,
"grad_norm": 0.6450520157814026,
"learning_rate": 7.652571669941005e-05,
"loss": 0.373,
"step": 10990
},
{
"epoch": 3.2429245283018866,
"grad_norm": 0.8540322184562683,
"learning_rate": 7.648633428820253e-05,
"loss": 0.3697,
"step": 11000
},
{
"epoch": 3.2429245283018866,
"eval_runtime": 2152.7202,
"eval_samples_per_second": 4.203,
"eval_steps_per_second": 0.525,
"step": 11000
},
{
"epoch": 3.245872641509434,
"grad_norm": 0.6468019485473633,
"learning_rate": 7.644692902235611e-05,
"loss": 0.3941,
"step": 11010
},
{
"epoch": 3.248820754716981,
"grad_norm": 0.6206846237182617,
"learning_rate": 7.640750093587298e-05,
"loss": 0.389,
"step": 11020
},
{
"epoch": 3.2517688679245285,
"grad_norm": 0.6426883339881897,
"learning_rate": 7.636805006277501e-05,
"loss": 0.4031,
"step": 11030
},
{
"epoch": 3.2547169811320753,
"grad_norm": 0.7230492234230042,
"learning_rate": 7.632857643710374e-05,
"loss": 0.4156,
"step": 11040
},
{
"epoch": 3.2576650943396226,
"grad_norm": 0.643689751625061,
"learning_rate": 7.628908009292035e-05,
"loss": 0.3678,
"step": 11050
},
{
"epoch": 3.26061320754717,
"grad_norm": 0.6697520017623901,
"learning_rate": 7.624956106430559e-05,
"loss": 0.4001,
"step": 11060
},
{
"epoch": 3.263561320754717,
"grad_norm": 0.535620927810669,
"learning_rate": 7.621001938535979e-05,
"loss": 0.3945,
"step": 11070
},
{
"epoch": 3.266509433962264,
"grad_norm": 0.6506339311599731,
"learning_rate": 7.617045509020289e-05,
"loss": 0.4008,
"step": 11080
},
{
"epoch": 3.2694575471698113,
"grad_norm": 0.7122704982757568,
"learning_rate": 7.613086821297424e-05,
"loss": 0.3855,
"step": 11090
},
{
"epoch": 3.2724056603773586,
"grad_norm": 0.5913416743278503,
"learning_rate": 7.609125878783277e-05,
"loss": 0.3823,
"step": 11100
},
{
"epoch": 3.2753537735849054,
"grad_norm": 0.5282361507415771,
"learning_rate": 7.60516268489568e-05,
"loss": 0.3969,
"step": 11110
},
{
"epoch": 3.2783018867924527,
"grad_norm": 0.6230885982513428,
"learning_rate": 7.60119724305441e-05,
"loss": 0.3699,
"step": 11120
},
{
"epoch": 3.28125,
"grad_norm": 0.7127173542976379,
"learning_rate": 7.59722955668119e-05,
"loss": 0.3727,
"step": 11130
},
{
"epoch": 3.2841981132075473,
"grad_norm": 0.5167060494422913,
"learning_rate": 7.593259629199665e-05,
"loss": 0.3861,
"step": 11140
},
{
"epoch": 3.2871462264150946,
"grad_norm": 0.732917308807373,
"learning_rate": 7.589287464035428e-05,
"loss": 0.395,
"step": 11150
},
{
"epoch": 3.2900943396226414,
"grad_norm": 1.1460092067718506,
"learning_rate": 7.585313064615998e-05,
"loss": 0.3991,
"step": 11160
},
{
"epoch": 3.2930424528301887,
"grad_norm": 0.48620718717575073,
"learning_rate": 7.581336434370817e-05,
"loss": 0.4097,
"step": 11170
},
{
"epoch": 3.295990566037736,
"grad_norm": 1.070408821105957,
"learning_rate": 7.57735757673126e-05,
"loss": 0.3865,
"step": 11180
},
{
"epoch": 3.298938679245283,
"grad_norm": 1.1448793411254883,
"learning_rate": 7.57337649513062e-05,
"loss": 0.3837,
"step": 11190
},
{
"epoch": 3.30188679245283,
"grad_norm": 0.6778339147567749,
"learning_rate": 7.569393193004108e-05,
"loss": 0.3719,
"step": 11200
},
{
"epoch": 3.3048349056603774,
"grad_norm": 0.6612284779548645,
"learning_rate": 7.565407673788855e-05,
"loss": 0.4058,
"step": 11210
},
{
"epoch": 3.3077830188679247,
"grad_norm": 0.5330166816711426,
"learning_rate": 7.561419940923898e-05,
"loss": 0.374,
"step": 11220
},
{
"epoch": 3.3107311320754715,
"grad_norm": 0.6324144601821899,
"learning_rate": 7.557429997850192e-05,
"loss": 0.3867,
"step": 11230
},
{
"epoch": 3.313679245283019,
"grad_norm": 0.6218940019607544,
"learning_rate": 7.553437848010597e-05,
"loss": 0.406,
"step": 11240
},
{
"epoch": 3.316627358490566,
"grad_norm": 0.656190037727356,
"learning_rate": 7.549443494849872e-05,
"loss": 0.3841,
"step": 11250
},
{
"epoch": 3.3195754716981134,
"grad_norm": 0.5203183889389038,
"learning_rate": 7.545446941814682e-05,
"loss": 0.3934,
"step": 11260
},
{
"epoch": 3.3225235849056602,
"grad_norm": 0.6663714647293091,
"learning_rate": 7.541448192353593e-05,
"loss": 0.401,
"step": 11270
},
{
"epoch": 3.3254716981132075,
"grad_norm": 0.5393223762512207,
"learning_rate": 7.53744724991706e-05,
"loss": 0.3712,
"step": 11280
},
{
"epoch": 3.328419811320755,
"grad_norm": 0.7120815515518188,
"learning_rate": 7.533444117957433e-05,
"loss": 0.394,
"step": 11290
},
{
"epoch": 3.331367924528302,
"grad_norm": 0.5879067182540894,
"learning_rate": 7.529438799928949e-05,
"loss": 0.3824,
"step": 11300
},
{
"epoch": 3.334316037735849,
"grad_norm": 0.7203354835510254,
"learning_rate": 7.525431299287738e-05,
"loss": 0.3997,
"step": 11310
},
{
"epoch": 3.3372641509433962,
"grad_norm": 0.6599690318107605,
"learning_rate": 7.521421619491806e-05,
"loss": 0.3935,
"step": 11320
},
{
"epoch": 3.3402122641509435,
"grad_norm": 0.8253904581069946,
"learning_rate": 7.517409764001043e-05,
"loss": 0.3774,
"step": 11330
},
{
"epoch": 3.3431603773584904,
"grad_norm": 0.618156909942627,
"learning_rate": 7.513395736277216e-05,
"loss": 0.4073,
"step": 11340
},
{
"epoch": 3.3461084905660377,
"grad_norm": 0.7392096519470215,
"learning_rate": 7.509379539783965e-05,
"loss": 0.3851,
"step": 11350
},
{
"epoch": 3.349056603773585,
"grad_norm": 0.6047782897949219,
"learning_rate": 7.505361177986803e-05,
"loss": 0.3737,
"step": 11360
},
{
"epoch": 3.3520047169811322,
"grad_norm": 0.6954718828201294,
"learning_rate": 7.501340654353113e-05,
"loss": 0.4062,
"step": 11370
},
{
"epoch": 3.354952830188679,
"grad_norm": 0.7750207781791687,
"learning_rate": 7.497317972352139e-05,
"loss": 0.3685,
"step": 11380
},
{
"epoch": 3.3579009433962264,
"grad_norm": 0.5615482926368713,
"learning_rate": 7.493293135454987e-05,
"loss": 0.3877,
"step": 11390
},
{
"epoch": 3.3608490566037736,
"grad_norm": 0.6940181255340576,
"learning_rate": 7.489266147134631e-05,
"loss": 0.3801,
"step": 11400
},
{
"epoch": 3.363797169811321,
"grad_norm": 0.6563462018966675,
"learning_rate": 7.485237010865891e-05,
"loss": 0.3835,
"step": 11410
},
{
"epoch": 3.3667452830188678,
"grad_norm": 0.6996760368347168,
"learning_rate": 7.481205730125447e-05,
"loss": 0.3671,
"step": 11420
},
{
"epoch": 3.369693396226415,
"grad_norm": 0.7388498187065125,
"learning_rate": 7.477172308391828e-05,
"loss": 0.3979,
"step": 11430
},
{
"epoch": 3.3726415094339623,
"grad_norm": 0.5929543375968933,
"learning_rate": 7.473136749145407e-05,
"loss": 0.3955,
"step": 11440
},
{
"epoch": 3.3755896226415096,
"grad_norm": 0.5443402528762817,
"learning_rate": 7.469099055868406e-05,
"loss": 0.3744,
"step": 11450
},
{
"epoch": 3.3785377358490565,
"grad_norm": 0.5640516877174377,
"learning_rate": 7.465059232044887e-05,
"loss": 0.3962,
"step": 11460
},
{
"epoch": 3.3814858490566038,
"grad_norm": 0.6397266983985901,
"learning_rate": 7.46101728116075e-05,
"loss": 0.3738,
"step": 11470
},
{
"epoch": 3.384433962264151,
"grad_norm": 0.7406302094459534,
"learning_rate": 7.456973206703732e-05,
"loss": 0.3619,
"step": 11480
},
{
"epoch": 3.387382075471698,
"grad_norm": 0.6106283664703369,
"learning_rate": 7.452927012163395e-05,
"loss": 0.3863,
"step": 11490
},
{
"epoch": 3.390330188679245,
"grad_norm": 0.7511622905731201,
"learning_rate": 7.448878701031142e-05,
"loss": 0.3918,
"step": 11500
},
{
"epoch": 3.3932783018867925,
"grad_norm": 0.7325133681297302,
"learning_rate": 7.444828276800196e-05,
"loss": 0.3838,
"step": 11510
},
{
"epoch": 3.3962264150943398,
"grad_norm": 0.5728434324264526,
"learning_rate": 7.440775742965602e-05,
"loss": 0.3832,
"step": 11520
},
{
"epoch": 3.3991745283018866,
"grad_norm": 0.613081157207489,
"learning_rate": 7.436721103024227e-05,
"loss": 0.392,
"step": 11530
},
{
"epoch": 3.402122641509434,
"grad_norm": 0.7749249935150146,
"learning_rate": 7.432664360474759e-05,
"loss": 0.3884,
"step": 11540
},
{
"epoch": 3.405070754716981,
"grad_norm": 0.6577479839324951,
"learning_rate": 7.428605518817694e-05,
"loss": 0.4222,
"step": 11550
},
{
"epoch": 3.4080188679245285,
"grad_norm": 0.6232143640518188,
"learning_rate": 7.424544581555342e-05,
"loss": 0.3835,
"step": 11560
},
{
"epoch": 3.4109669811320753,
"grad_norm": 0.7117622494697571,
"learning_rate": 7.420481552191825e-05,
"loss": 0.3913,
"step": 11570
},
{
"epoch": 3.4139150943396226,
"grad_norm": 0.710422694683075,
"learning_rate": 7.416416434233063e-05,
"loss": 0.3654,
"step": 11580
},
{
"epoch": 3.41686320754717,
"grad_norm": 0.61385577917099,
"learning_rate": 7.412349231186784e-05,
"loss": 0.376,
"step": 11590
},
{
"epoch": 3.419811320754717,
"grad_norm": 0.6077531576156616,
"learning_rate": 7.408279946562512e-05,
"loss": 0.3836,
"step": 11600
},
{
"epoch": 3.422759433962264,
"grad_norm": 0.7580487132072449,
"learning_rate": 7.404208583871569e-05,
"loss": 0.4002,
"step": 11610
},
{
"epoch": 3.4257075471698113,
"grad_norm": 0.5125753283500671,
"learning_rate": 7.400135146627069e-05,
"loss": 0.3671,
"step": 11620
},
{
"epoch": 3.4286556603773586,
"grad_norm": 0.7817121148109436,
"learning_rate": 7.396059638343918e-05,
"loss": 0.3946,
"step": 11630
},
{
"epoch": 3.4316037735849054,
"grad_norm": 0.7812344431877136,
"learning_rate": 7.391982062538807e-05,
"loss": 0.3742,
"step": 11640
},
{
"epoch": 3.4345518867924527,
"grad_norm": 0.5848270058631897,
"learning_rate": 7.38790242273021e-05,
"loss": 0.3949,
"step": 11650
},
{
"epoch": 3.4375,
"grad_norm": 0.606198251247406,
"learning_rate": 7.383820722438386e-05,
"loss": 0.3918,
"step": 11660
},
{
"epoch": 3.4404481132075473,
"grad_norm": 0.6764916181564331,
"learning_rate": 7.379736965185368e-05,
"loss": 0.3943,
"step": 11670
},
{
"epoch": 3.4433962264150946,
"grad_norm": 0.5637912750244141,
"learning_rate": 7.375651154494967e-05,
"loss": 0.3705,
"step": 11680
},
{
"epoch": 3.4463443396226414,
"grad_norm": 0.6343154311180115,
"learning_rate": 7.371563293892761e-05,
"loss": 0.3886,
"step": 11690
},
{
"epoch": 3.4492924528301887,
"grad_norm": 0.6066147685050964,
"learning_rate": 7.367473386906105e-05,
"loss": 0.4318,
"step": 11700
},
{
"epoch": 3.452240566037736,
"grad_norm": 0.7721722722053528,
"learning_rate": 7.363381437064112e-05,
"loss": 0.4059,
"step": 11710
},
{
"epoch": 3.455188679245283,
"grad_norm": 0.7035283446311951,
"learning_rate": 7.359287447897661e-05,
"loss": 0.3695,
"step": 11720
},
{
"epoch": 3.45813679245283,
"grad_norm": 0.6725998520851135,
"learning_rate": 7.355191422939393e-05,
"loss": 0.3732,
"step": 11730
},
{
"epoch": 3.4610849056603774,
"grad_norm": 0.7684424519538879,
"learning_rate": 7.351093365723699e-05,
"loss": 0.396,
"step": 11740
},
{
"epoch": 3.4640330188679247,
"grad_norm": 0.5313234925270081,
"learning_rate": 7.346993279786732e-05,
"loss": 0.4071,
"step": 11750
},
{
"epoch": 3.4669811320754715,
"grad_norm": 0.6019145846366882,
"learning_rate": 7.342891168666388e-05,
"loss": 0.36,
"step": 11760
},
{
"epoch": 3.469929245283019,
"grad_norm": 0.5979236960411072,
"learning_rate": 7.338787035902314e-05,
"loss": 0.3606,
"step": 11770
},
{
"epoch": 3.472877358490566,
"grad_norm": 0.6868577003479004,
"learning_rate": 7.334680885035904e-05,
"loss": 0.397,
"step": 11780
},
{
"epoch": 3.4758254716981134,
"grad_norm": 0.6378858089447021,
"learning_rate": 7.330572719610289e-05,
"loss": 0.3771,
"step": 11790
},
{
"epoch": 3.4787735849056602,
"grad_norm": 0.6885668635368347,
"learning_rate": 7.326462543170338e-05,
"loss": 0.4092,
"step": 11800
},
{
"epoch": 3.4817216981132075,
"grad_norm": 0.5670936703681946,
"learning_rate": 7.322350359262662e-05,
"loss": 0.3738,
"step": 11810
},
{
"epoch": 3.484669811320755,
"grad_norm": 0.8134390115737915,
"learning_rate": 7.318236171435594e-05,
"loss": 0.4033,
"step": 11820
},
{
"epoch": 3.487617924528302,
"grad_norm": 0.6211479902267456,
"learning_rate": 7.314119983239204e-05,
"loss": 0.4148,
"step": 11830
},
{
"epoch": 3.490566037735849,
"grad_norm": 0.8247025609016418,
"learning_rate": 7.310001798225288e-05,
"loss": 0.4096,
"step": 11840
},
{
"epoch": 3.4935141509433962,
"grad_norm": 0.750121533870697,
"learning_rate": 7.305881619947359e-05,
"loss": 0.4268,
"step": 11850
},
{
"epoch": 3.4964622641509435,
"grad_norm": 1.0719488859176636,
"learning_rate": 7.301759451960657e-05,
"loss": 0.3988,
"step": 11860
},
{
"epoch": 3.4994103773584904,
"grad_norm": 0.6894809007644653,
"learning_rate": 7.297635297822132e-05,
"loss": 0.3954,
"step": 11870
},
{
"epoch": 3.5023584905660377,
"grad_norm": 1.2765004634857178,
"learning_rate": 7.293509161090452e-05,
"loss": 0.3844,
"step": 11880
},
{
"epoch": 3.505306603773585,
"grad_norm": 0.6348519325256348,
"learning_rate": 7.289381045325999e-05,
"loss": 0.386,
"step": 11890
},
{
"epoch": 3.5082547169811322,
"grad_norm": 0.606871485710144,
"learning_rate": 7.285250954090854e-05,
"loss": 0.3729,
"step": 11900
},
{
"epoch": 3.5112028301886795,
"grad_norm": 0.6608160734176636,
"learning_rate": 7.28111889094881e-05,
"loss": 0.3702,
"step": 11910
},
{
"epoch": 3.5141509433962264,
"grad_norm": 0.6897141933441162,
"learning_rate": 7.27698485946536e-05,
"loss": 0.3903,
"step": 11920
},
{
"epoch": 3.5170990566037736,
"grad_norm": 0.7128772735595703,
"learning_rate": 7.272848863207691e-05,
"loss": 0.3795,
"step": 11930
},
{
"epoch": 3.5200471698113205,
"grad_norm": 0.6260384321212769,
"learning_rate": 7.268710905744691e-05,
"loss": 0.3978,
"step": 11940
},
{
"epoch": 3.5229952830188678,
"grad_norm": 0.5759779214859009,
"learning_rate": 7.264570990646938e-05,
"loss": 0.3898,
"step": 11950
},
{
"epoch": 3.525943396226415,
"grad_norm": 0.5746546387672424,
"learning_rate": 7.260429121486698e-05,
"loss": 0.3709,
"step": 11960
},
{
"epoch": 3.5288915094339623,
"grad_norm": 0.6677197217941284,
"learning_rate": 7.256285301837927e-05,
"loss": 0.3654,
"step": 11970
},
{
"epoch": 3.5318396226415096,
"grad_norm": 0.5076614022254944,
"learning_rate": 7.252139535276256e-05,
"loss": 0.3847,
"step": 11980
},
{
"epoch": 3.5347877358490565,
"grad_norm": 0.6482424736022949,
"learning_rate": 7.247991825379007e-05,
"loss": 0.3505,
"step": 11990
},
{
"epoch": 3.5377358490566038,
"grad_norm": 0.6378607749938965,
"learning_rate": 7.243842175725172e-05,
"loss": 0.4128,
"step": 12000
},
{
"epoch": 3.5377358490566038,
"eval_runtime": 2152.0566,
"eval_samples_per_second": 4.204,
"eval_steps_per_second": 0.526,
"step": 12000
},
{
"epoch": 3.540683962264151,
"grad_norm": 0.5796979665756226,
"learning_rate": 7.239690589895416e-05,
"loss": 0.3843,
"step": 12010
},
{
"epoch": 3.543632075471698,
"grad_norm": 0.6449692249298096,
"learning_rate": 7.235537071472078e-05,
"loss": 0.3716,
"step": 12020
},
{
"epoch": 3.546580188679245,
"grad_norm": 0.6569247245788574,
"learning_rate": 7.231381624039164e-05,
"loss": 0.3915,
"step": 12030
},
{
"epoch": 3.5495283018867925,
"grad_norm": 0.5907280445098877,
"learning_rate": 7.227224251182342e-05,
"loss": 0.3648,
"step": 12040
},
{
"epoch": 3.5524764150943398,
"grad_norm": 0.6250777840614319,
"learning_rate": 7.223064956488946e-05,
"loss": 0.392,
"step": 12050
},
{
"epoch": 3.555424528301887,
"grad_norm": 0.6015505194664001,
"learning_rate": 7.218903743547964e-05,
"loss": 0.3705,
"step": 12060
},
{
"epoch": 3.558372641509434,
"grad_norm": 0.6118935346603394,
"learning_rate": 7.214740615950041e-05,
"loss": 0.3841,
"step": 12070
},
{
"epoch": 3.561320754716981,
"grad_norm": 0.5655686855316162,
"learning_rate": 7.210575577287473e-05,
"loss": 0.3726,
"step": 12080
},
{
"epoch": 3.5642688679245285,
"grad_norm": 0.6098667979240417,
"learning_rate": 7.206408631154207e-05,
"loss": 0.3775,
"step": 12090
},
{
"epoch": 3.5672169811320753,
"grad_norm": 0.5612145066261292,
"learning_rate": 7.202239781145834e-05,
"loss": 0.4044,
"step": 12100
},
{
"epoch": 3.5701650943396226,
"grad_norm": 0.6391407251358032,
"learning_rate": 7.198069030859591e-05,
"loss": 0.3744,
"step": 12110
},
{
"epoch": 3.57311320754717,
"grad_norm": 0.5467461347579956,
"learning_rate": 7.193896383894351e-05,
"loss": 0.3872,
"step": 12120
},
{
"epoch": 3.576061320754717,
"grad_norm": 0.5849887728691101,
"learning_rate": 7.189721843850624e-05,
"loss": 0.3996,
"step": 12130
},
{
"epoch": 3.579009433962264,
"grad_norm": 0.5455970764160156,
"learning_rate": 7.185545414330557e-05,
"loss": 0.3771,
"step": 12140
},
{
"epoch": 3.5819575471698113,
"grad_norm": 0.6030681729316711,
"learning_rate": 7.181367098937921e-05,
"loss": 0.3716,
"step": 12150
},
{
"epoch": 3.5849056603773586,
"grad_norm": 0.5632196068763733,
"learning_rate": 7.177186901278124e-05,
"loss": 0.3531,
"step": 12160
},
{
"epoch": 3.5878537735849054,
"grad_norm": 0.6945312023162842,
"learning_rate": 7.173004824958187e-05,
"loss": 0.3804,
"step": 12170
},
{
"epoch": 3.5908018867924527,
"grad_norm": 0.5595278739929199,
"learning_rate": 7.168820873586759e-05,
"loss": 0.3578,
"step": 12180
},
{
"epoch": 3.59375,
"grad_norm": 0.5647584199905396,
"learning_rate": 7.164635050774109e-05,
"loss": 0.376,
"step": 12190
},
{
"epoch": 3.5966981132075473,
"grad_norm": 0.589163601398468,
"learning_rate": 7.160447360132113e-05,
"loss": 0.4249,
"step": 12200
},
{
"epoch": 3.5996462264150946,
"grad_norm": 0.5941894054412842,
"learning_rate": 7.156257805274263e-05,
"loss": 0.39,
"step": 12210
},
{
"epoch": 3.6025943396226414,
"grad_norm": 0.7123454213142395,
"learning_rate": 7.152066389815663e-05,
"loss": 0.3745,
"step": 12220
},
{
"epoch": 3.6055424528301887,
"grad_norm": 0.5194860100746155,
"learning_rate": 7.147873117373016e-05,
"loss": 0.3778,
"step": 12230
},
{
"epoch": 3.608490566037736,
"grad_norm": 0.545570969581604,
"learning_rate": 7.143677991564632e-05,
"loss": 0.3582,
"step": 12240
},
{
"epoch": 3.611438679245283,
"grad_norm": 0.639003574848175,
"learning_rate": 7.139481016010419e-05,
"loss": 0.3762,
"step": 12250
},
{
"epoch": 3.61438679245283,
"grad_norm": 0.6492764353752136,
"learning_rate": 7.13528219433188e-05,
"loss": 0.3879,
"step": 12260
},
{
"epoch": 3.6173349056603774,
"grad_norm": 0.802506148815155,
"learning_rate": 7.131081530152111e-05,
"loss": 0.3954,
"step": 12270
},
{
"epoch": 3.6202830188679247,
"grad_norm": 0.5870131850242615,
"learning_rate": 7.126879027095802e-05,
"loss": 0.3751,
"step": 12280
},
{
"epoch": 3.6232311320754715,
"grad_norm": 0.574275553226471,
"learning_rate": 7.122674688789223e-05,
"loss": 0.3918,
"step": 12290
},
{
"epoch": 3.626179245283019,
"grad_norm": 0.6729950904846191,
"learning_rate": 7.118468518860232e-05,
"loss": 0.3934,
"step": 12300
},
{
"epoch": 3.629127358490566,
"grad_norm": 0.5776434540748596,
"learning_rate": 7.114260520938265e-05,
"loss": 0.3924,
"step": 12310
},
{
"epoch": 3.632075471698113,
"grad_norm": 0.5719219446182251,
"learning_rate": 7.11005069865434e-05,
"loss": 0.3747,
"step": 12320
},
{
"epoch": 3.6350235849056602,
"grad_norm": 0.6872524619102478,
"learning_rate": 7.105839055641045e-05,
"loss": 0.3862,
"step": 12330
},
{
"epoch": 3.6379716981132075,
"grad_norm": 0.592881441116333,
"learning_rate": 7.101625595532539e-05,
"loss": 0.3772,
"step": 12340
},
{
"epoch": 3.640919811320755,
"grad_norm": 0.607401967048645,
"learning_rate": 7.097410321964549e-05,
"loss": 0.3741,
"step": 12350
},
{
"epoch": 3.643867924528302,
"grad_norm": 0.6241199374198914,
"learning_rate": 7.093193238574372e-05,
"loss": 0.3903,
"step": 12360
},
{
"epoch": 3.646816037735849,
"grad_norm": 0.5976290106773376,
"learning_rate": 7.088974349000859e-05,
"loss": 0.382,
"step": 12370
},
{
"epoch": 3.6497641509433962,
"grad_norm": 0.6723179817199707,
"learning_rate": 7.084753656884424e-05,
"loss": 0.3767,
"step": 12380
},
{
"epoch": 3.6527122641509435,
"grad_norm": 0.712138295173645,
"learning_rate": 7.080531165867036e-05,
"loss": 0.4065,
"step": 12390
},
{
"epoch": 3.6556603773584904,
"grad_norm": 0.8522016406059265,
"learning_rate": 7.076306879592215e-05,
"loss": 0.3865,
"step": 12400
},
{
"epoch": 3.6586084905660377,
"grad_norm": 0.5308501720428467,
"learning_rate": 7.072080801705032e-05,
"loss": 0.3744,
"step": 12410
},
{
"epoch": 3.661556603773585,
"grad_norm": 0.7942071557044983,
"learning_rate": 7.067852935852102e-05,
"loss": 0.4065,
"step": 12420
},
{
"epoch": 3.6645047169811322,
"grad_norm": 0.5463528633117676,
"learning_rate": 7.063623285681583e-05,
"loss": 0.3788,
"step": 12430
},
{
"epoch": 3.6674528301886795,
"grad_norm": 0.5827257037162781,
"learning_rate": 7.059391854843175e-05,
"loss": 0.3909,
"step": 12440
},
{
"epoch": 3.6704009433962264,
"grad_norm": 0.650521457195282,
"learning_rate": 7.055158646988109e-05,
"loss": 0.3519,
"step": 12450
},
{
"epoch": 3.6733490566037736,
"grad_norm": 0.6249088048934937,
"learning_rate": 7.050923665769157e-05,
"loss": 0.3739,
"step": 12460
},
{
"epoch": 3.6762971698113205,
"grad_norm": 0.49112915992736816,
"learning_rate": 7.046686914840617e-05,
"loss": 0.3841,
"step": 12470
},
{
"epoch": 3.6792452830188678,
"grad_norm": 0.667676568031311,
"learning_rate": 7.042448397858311e-05,
"loss": 0.3961,
"step": 12480
},
{
"epoch": 3.682193396226415,
"grad_norm": 0.7432703375816345,
"learning_rate": 7.038208118479592e-05,
"loss": 0.3796,
"step": 12490
},
{
"epoch": 3.6851415094339623,
"grad_norm": 0.8634368777275085,
"learning_rate": 7.033966080363328e-05,
"loss": 0.3925,
"step": 12500
},
{
"epoch": 3.6880896226415096,
"grad_norm": 0.6643924117088318,
"learning_rate": 7.029722287169906e-05,
"loss": 0.3932,
"step": 12510
},
{
"epoch": 3.6910377358490565,
"grad_norm": 0.8129070997238159,
"learning_rate": 7.025476742561232e-05,
"loss": 0.375,
"step": 12520
},
{
"epoch": 3.6939858490566038,
"grad_norm": 0.7442275881767273,
"learning_rate": 7.021229450200714e-05,
"loss": 0.4033,
"step": 12530
},
{
"epoch": 3.696933962264151,
"grad_norm": 0.7939448356628418,
"learning_rate": 7.016980413753275e-05,
"loss": 0.3772,
"step": 12540
},
{
"epoch": 3.699882075471698,
"grad_norm": 0.5956901907920837,
"learning_rate": 7.012729636885345e-05,
"loss": 0.3722,
"step": 12550
},
{
"epoch": 3.702830188679245,
"grad_norm": 0.8104729056358337,
"learning_rate": 7.008477123264848e-05,
"loss": 0.3975,
"step": 12560
},
{
"epoch": 3.7057783018867925,
"grad_norm": 0.7263352870941162,
"learning_rate": 7.004222876561212e-05,
"loss": 0.4013,
"step": 12570
},
{
"epoch": 3.7087264150943398,
"grad_norm": 0.6080439686775208,
"learning_rate": 6.999966900445359e-05,
"loss": 0.3901,
"step": 12580
},
{
"epoch": 3.711674528301887,
"grad_norm": 0.671343982219696,
"learning_rate": 6.995709198589704e-05,
"loss": 0.3647,
"step": 12590
},
{
"epoch": 3.714622641509434,
"grad_norm": 0.651388943195343,
"learning_rate": 6.991449774668149e-05,
"loss": 0.3931,
"step": 12600
},
{
"epoch": 3.717570754716981,
"grad_norm": 0.6568498611450195,
"learning_rate": 6.987188632356086e-05,
"loss": 0.4037,
"step": 12610
},
{
"epoch": 3.7205188679245285,
"grad_norm": 0.6682468056678772,
"learning_rate": 6.982925775330385e-05,
"loss": 0.3956,
"step": 12620
},
{
"epoch": 3.7234669811320753,
"grad_norm": 0.5057411789894104,
"learning_rate": 6.978661207269399e-05,
"loss": 0.3882,
"step": 12630
},
{
"epoch": 3.7264150943396226,
"grad_norm": 0.8186591267585754,
"learning_rate": 6.974394931852956e-05,
"loss": 0.3811,
"step": 12640
},
{
"epoch": 3.72936320754717,
"grad_norm": 0.55955570936203,
"learning_rate": 6.97012695276236e-05,
"loss": 0.3799,
"step": 12650
},
{
"epoch": 3.732311320754717,
"grad_norm": 0.6058364510536194,
"learning_rate": 6.965857273680379e-05,
"loss": 0.3962,
"step": 12660
},
{
"epoch": 3.735259433962264,
"grad_norm": 0.7165114879608154,
"learning_rate": 6.961585898291251e-05,
"loss": 0.3661,
"step": 12670
},
{
"epoch": 3.7382075471698113,
"grad_norm": 0.48929327726364136,
"learning_rate": 6.957312830280685e-05,
"loss": 0.3868,
"step": 12680
},
{
"epoch": 3.7411556603773586,
"grad_norm": 0.5545490980148315,
"learning_rate": 6.953038073335834e-05,
"loss": 0.3876,
"step": 12690
},
{
"epoch": 3.7441037735849054,
"grad_norm": 0.6039651036262512,
"learning_rate": 6.948761631145327e-05,
"loss": 0.3867,
"step": 12700
},
{
"epoch": 3.7470518867924527,
"grad_norm": 0.6120061278343201,
"learning_rate": 6.944483507399233e-05,
"loss": 0.4068,
"step": 12710
},
{
"epoch": 3.75,
"grad_norm": 0.8350451588630676,
"learning_rate": 6.940203705789078e-05,
"loss": 0.3708,
"step": 12720
},
{
"epoch": 3.7529481132075473,
"grad_norm": 0.72921222448349,
"learning_rate": 6.935922230007837e-05,
"loss": 0.3935,
"step": 12730
},
{
"epoch": 3.7558962264150946,
"grad_norm": 0.7849156260490417,
"learning_rate": 6.931639083749927e-05,
"loss": 0.3796,
"step": 12740
},
{
"epoch": 3.7588443396226414,
"grad_norm": 0.4637618064880371,
"learning_rate": 6.927354270711206e-05,
"loss": 0.4087,
"step": 12750
},
{
"epoch": 3.7617924528301887,
"grad_norm": 0.5398821234703064,
"learning_rate": 6.923067794588972e-05,
"loss": 0.3737,
"step": 12760
},
{
"epoch": 3.764740566037736,
"grad_norm": 0.621241569519043,
"learning_rate": 6.918779659081959e-05,
"loss": 0.3863,
"step": 12770
},
{
"epoch": 3.767688679245283,
"grad_norm": 0.5977849364280701,
"learning_rate": 6.91448986789033e-05,
"loss": 0.3656,
"step": 12780
},
{
"epoch": 3.77063679245283,
"grad_norm": 0.702674925327301,
"learning_rate": 6.910198424715676e-05,
"loss": 0.4081,
"step": 12790
},
{
"epoch": 3.7735849056603774,
"grad_norm": 0.8087901473045349,
"learning_rate": 6.90590533326102e-05,
"loss": 0.3804,
"step": 12800
},
{
"epoch": 3.7765330188679247,
"grad_norm": 0.6610251069068909,
"learning_rate": 6.901610597230796e-05,
"loss": 0.3784,
"step": 12810
},
{
"epoch": 3.7794811320754715,
"grad_norm": 0.6378030776977539,
"learning_rate": 6.897314220330873e-05,
"loss": 0.4058,
"step": 12820
},
{
"epoch": 3.782429245283019,
"grad_norm": 0.5867325067520142,
"learning_rate": 6.893016206268518e-05,
"loss": 0.3941,
"step": 12830
},
{
"epoch": 3.785377358490566,
"grad_norm": 0.6321238875389099,
"learning_rate": 6.888716558752424e-05,
"loss": 0.4151,
"step": 12840
},
{
"epoch": 3.788325471698113,
"grad_norm": 0.5633270144462585,
"learning_rate": 6.884415281492687e-05,
"loss": 0.3874,
"step": 12850
},
{
"epoch": 3.7912735849056602,
"grad_norm": 0.5062634348869324,
"learning_rate": 6.880112378200812e-05,
"loss": 0.4013,
"step": 12860
},
{
"epoch": 3.7942216981132075,
"grad_norm": 0.6013985872268677,
"learning_rate": 6.875807852589707e-05,
"loss": 0.3871,
"step": 12870
},
{
"epoch": 3.797169811320755,
"grad_norm": 0.604705274105072,
"learning_rate": 6.871501708373675e-05,
"loss": 0.4019,
"step": 12880
},
{
"epoch": 3.800117924528302,
"grad_norm": 0.859552800655365,
"learning_rate": 6.867193949268426e-05,
"loss": 0.3865,
"step": 12890
},
{
"epoch": 3.803066037735849,
"grad_norm": 0.6676722764968872,
"learning_rate": 6.862884578991053e-05,
"loss": 0.3895,
"step": 12900
},
{
"epoch": 3.8060141509433962,
"grad_norm": 0.6155467629432678,
"learning_rate": 6.858573601260044e-05,
"loss": 0.3855,
"step": 12910
},
{
"epoch": 3.8089622641509435,
"grad_norm": 0.6465997099876404,
"learning_rate": 6.854261019795274e-05,
"loss": 0.3798,
"step": 12920
},
{
"epoch": 3.8119103773584904,
"grad_norm": 0.6289038062095642,
"learning_rate": 6.849946838318002e-05,
"loss": 0.361,
"step": 12930
},
{
"epoch": 3.8148584905660377,
"grad_norm": 0.8101239204406738,
"learning_rate": 6.845631060550865e-05,
"loss": 0.3829,
"step": 12940
},
{
"epoch": 3.817806603773585,
"grad_norm": 0.6101033091545105,
"learning_rate": 6.841313690217881e-05,
"loss": 0.3897,
"step": 12950
},
{
"epoch": 3.8207547169811322,
"grad_norm": 0.57803875207901,
"learning_rate": 6.836994731044441e-05,
"loss": 0.3883,
"step": 12960
},
{
"epoch": 3.8237028301886795,
"grad_norm": 0.7633528113365173,
"learning_rate": 6.832674186757305e-05,
"loss": 0.3686,
"step": 12970
},
{
"epoch": 3.8266509433962264,
"grad_norm": 0.6265213489532471,
"learning_rate": 6.828352061084603e-05,
"loss": 0.398,
"step": 12980
},
{
"epoch": 3.8295990566037736,
"grad_norm": 0.619233250617981,
"learning_rate": 6.82402835775583e-05,
"loss": 0.3788,
"step": 12990
},
{
"epoch": 3.8325471698113205,
"grad_norm": 0.6542940735816956,
"learning_rate": 6.819703080501838e-05,
"loss": 0.376,
"step": 13000
},
{
"epoch": 3.8325471698113205,
"eval_runtime": 2155.4586,
"eval_samples_per_second": 4.197,
"eval_steps_per_second": 0.525,
"step": 13000
},
{
"epoch": 3.8354952830188678,
"grad_norm": 0.6155036687850952,
"learning_rate": 6.815376233054844e-05,
"loss": 0.3698,
"step": 13010
},
{
"epoch": 3.838443396226415,
"grad_norm": 0.69612056016922,
"learning_rate": 6.811047819148413e-05,
"loss": 0.379,
"step": 13020
},
{
"epoch": 3.8413915094339623,
"grad_norm": 0.5194510221481323,
"learning_rate": 6.806717842517467e-05,
"loss": 0.3594,
"step": 13030
},
{
"epoch": 3.8443396226415096,
"grad_norm": 0.5976055860519409,
"learning_rate": 6.802386306898275e-05,
"loss": 0.3877,
"step": 13040
},
{
"epoch": 3.8472877358490565,
"grad_norm": 0.6758872866630554,
"learning_rate": 6.798053216028448e-05,
"loss": 0.3907,
"step": 13050
},
{
"epoch": 3.8502358490566038,
"grad_norm": 0.7552269101142883,
"learning_rate": 6.793718573646944e-05,
"loss": 0.3947,
"step": 13060
},
{
"epoch": 3.853183962264151,
"grad_norm": 0.7220373749732971,
"learning_rate": 6.789382383494057e-05,
"loss": 0.3753,
"step": 13070
},
{
"epoch": 3.856132075471698,
"grad_norm": 0.5295929312705994,
"learning_rate": 6.785044649311415e-05,
"loss": 0.3865,
"step": 13080
},
{
"epoch": 3.859080188679245,
"grad_norm": 0.6221385598182678,
"learning_rate": 6.780705374841981e-05,
"loss": 0.3999,
"step": 13090
},
{
"epoch": 3.8620283018867925,
"grad_norm": 0.6315896511077881,
"learning_rate": 6.776364563830047e-05,
"loss": 0.3795,
"step": 13100
},
{
"epoch": 3.8649764150943398,
"grad_norm": 0.6725174188613892,
"learning_rate": 6.77202222002123e-05,
"loss": 0.3969,
"step": 13110
},
{
"epoch": 3.867924528301887,
"grad_norm": 0.5970048308372498,
"learning_rate": 6.76767834716247e-05,
"loss": 0.3899,
"step": 13120
},
{
"epoch": 3.870872641509434,
"grad_norm": 0.6575031876564026,
"learning_rate": 6.763332949002026e-05,
"loss": 0.3941,
"step": 13130
},
{
"epoch": 3.873820754716981,
"grad_norm": 0.6206411719322205,
"learning_rate": 6.758986029289474e-05,
"loss": 0.3643,
"step": 13140
},
{
"epoch": 3.8767688679245285,
"grad_norm": 0.9830887913703918,
"learning_rate": 6.7546375917757e-05,
"loss": 0.3918,
"step": 13150
},
{
"epoch": 3.8797169811320753,
"grad_norm": 0.683879017829895,
"learning_rate": 6.750287640212903e-05,
"loss": 0.4018,
"step": 13160
},
{
"epoch": 3.8826650943396226,
"grad_norm": 0.6155050992965698,
"learning_rate": 6.745936178354588e-05,
"loss": 0.363,
"step": 13170
},
{
"epoch": 3.88561320754717,
"grad_norm": 0.7064786553382874,
"learning_rate": 6.741583209955564e-05,
"loss": 0.3712,
"step": 13180
},
{
"epoch": 3.888561320754717,
"grad_norm": 0.6027262210845947,
"learning_rate": 6.737228738771937e-05,
"loss": 0.3549,
"step": 13190
},
{
"epoch": 3.891509433962264,
"grad_norm": 0.5665203332901001,
"learning_rate": 6.73287276856111e-05,
"loss": 0.4045,
"step": 13200
},
{
"epoch": 3.8944575471698113,
"grad_norm": 0.7666698098182678,
"learning_rate": 6.728515303081781e-05,
"loss": 0.3962,
"step": 13210
},
{
"epoch": 3.8974056603773586,
"grad_norm": 0.6209610104560852,
"learning_rate": 6.724156346093942e-05,
"loss": 0.4021,
"step": 13220
},
{
"epoch": 3.9003537735849054,
"grad_norm": 0.6751357316970825,
"learning_rate": 6.719795901358864e-05,
"loss": 0.3836,
"step": 13230
},
{
"epoch": 3.9033018867924527,
"grad_norm": 0.5666335225105286,
"learning_rate": 6.715433972639106e-05,
"loss": 0.3622,
"step": 13240
},
{
"epoch": 3.90625,
"grad_norm": 0.6070656180381775,
"learning_rate": 6.711070563698508e-05,
"loss": 0.4011,
"step": 13250
},
{
"epoch": 3.9091981132075473,
"grad_norm": 0.5795000195503235,
"learning_rate": 6.706705678302187e-05,
"loss": 0.3873,
"step": 13260
},
{
"epoch": 3.9121462264150946,
"grad_norm": 0.6695361733436584,
"learning_rate": 6.702339320216534e-05,
"loss": 0.3896,
"step": 13270
},
{
"epoch": 3.9150943396226414,
"grad_norm": 0.7268232107162476,
"learning_rate": 6.69797149320921e-05,
"loss": 0.3798,
"step": 13280
},
{
"epoch": 3.9180424528301887,
"grad_norm": 0.688899040222168,
"learning_rate": 6.693602201049142e-05,
"loss": 0.366,
"step": 13290
},
{
"epoch": 3.920990566037736,
"grad_norm": 0.775772750377655,
"learning_rate": 6.689231447506526e-05,
"loss": 0.3983,
"step": 13300
},
{
"epoch": 3.923938679245283,
"grad_norm": 0.48991528153419495,
"learning_rate": 6.684859236352814e-05,
"loss": 0.3875,
"step": 13310
},
{
"epoch": 3.92688679245283,
"grad_norm": 0.6462919116020203,
"learning_rate": 6.68048557136072e-05,
"loss": 0.3702,
"step": 13320
},
{
"epoch": 3.9298349056603774,
"grad_norm": 0.6816546320915222,
"learning_rate": 6.676110456304207e-05,
"loss": 0.3724,
"step": 13330
},
{
"epoch": 3.9327830188679247,
"grad_norm": 0.6184906363487244,
"learning_rate": 6.671733894958496e-05,
"loss": 0.4002,
"step": 13340
},
{
"epoch": 3.9357311320754715,
"grad_norm": 0.6362095475196838,
"learning_rate": 6.667355891100049e-05,
"loss": 0.4209,
"step": 13350
},
{
"epoch": 3.938679245283019,
"grad_norm": 0.544964611530304,
"learning_rate": 6.662976448506578e-05,
"loss": 0.365,
"step": 13360
},
{
"epoch": 3.941627358490566,
"grad_norm": 0.8723105192184448,
"learning_rate": 6.658595570957038e-05,
"loss": 0.3624,
"step": 13370
},
{
"epoch": 3.944575471698113,
"grad_norm": 0.5339908599853516,
"learning_rate": 6.654213262231612e-05,
"loss": 0.3884,
"step": 13380
},
{
"epoch": 3.9475235849056602,
"grad_norm": 0.5904654860496521,
"learning_rate": 6.649829526111733e-05,
"loss": 0.4111,
"step": 13390
},
{
"epoch": 3.9504716981132075,
"grad_norm": 0.576021671295166,
"learning_rate": 6.64544436638005e-05,
"loss": 0.3985,
"step": 13400
},
{
"epoch": 3.953419811320755,
"grad_norm": 0.6284976601600647,
"learning_rate": 6.641057786820452e-05,
"loss": 0.3853,
"step": 13410
},
{
"epoch": 3.956367924528302,
"grad_norm": 0.9079892635345459,
"learning_rate": 6.63666979121805e-05,
"loss": 0.3695,
"step": 13420
},
{
"epoch": 3.959316037735849,
"grad_norm": 0.6591789722442627,
"learning_rate": 6.632280383359172e-05,
"loss": 0.391,
"step": 13430
},
{
"epoch": 3.9622641509433962,
"grad_norm": 0.7778935432434082,
"learning_rate": 6.627889567031373e-05,
"loss": 0.374,
"step": 13440
},
{
"epoch": 3.9652122641509435,
"grad_norm": 0.5455933213233948,
"learning_rate": 6.623497346023418e-05,
"loss": 0.4089,
"step": 13450
},
{
"epoch": 3.9681603773584904,
"grad_norm": 0.6310223937034607,
"learning_rate": 6.619103724125282e-05,
"loss": 0.3971,
"step": 13460
},
{
"epoch": 3.9711084905660377,
"grad_norm": 0.711181104183197,
"learning_rate": 6.614708705128154e-05,
"loss": 0.3966,
"step": 13470
},
{
"epoch": 3.974056603773585,
"grad_norm": 0.617135226726532,
"learning_rate": 6.610312292824427e-05,
"loss": 0.3643,
"step": 13480
},
{
"epoch": 3.9770047169811322,
"grad_norm": 0.5437451601028442,
"learning_rate": 6.605914491007695e-05,
"loss": 0.3626,
"step": 13490
},
{
"epoch": 3.9799528301886795,
"grad_norm": 0.6699962615966797,
"learning_rate": 6.601515303472752e-05,
"loss": 0.3966,
"step": 13500
}
],
"logging_steps": 10,
"max_steps": 33920,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0044657058122301e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}