Ours-RadFM-8ep-batch2-maxlen2048 / trainer_state.json
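The rest of this page is the raw trainer state saved for this run (8 epochs, 38,184 global steps, evaluation every 10,000 steps, in the standard Hugging Face `Trainer` format). Below is a minimal sketch, not part of the original upload, of how one might load the `log_history` entries and plot the training loss, evaluation loss, and learning-rate schedule; the local path `trainer_state.json` is an assumption and should point at a downloaded copy of this file.

```python
# Minimal sketch (assumption: a local copy of this file named trainer_state.json).
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # assumed local path
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses, label="train loss")
ax_loss.scatter([e["step"] for e in eval_logs],
                [e["eval_loss"] for e in eval_logs],
                color="red", label="eval loss")
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("loss")
ax_loss.legend()
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
plt.show()

print("best_metric:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])
```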
{
"best_metric": 0.01002925168722868,
"best_model_checkpoint": "/workspace/previous_works/RadFM/output/RadFM-Llama3-8B-pretrain-0002-embed_tokens-depth32-lora-8ep-maxlen2048/checkpoint-20000",
"epoch": 8.0,
"eval_steps": 10000,
"global_step": 38184,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008170961659333752,
"grad_norm": 42.493404388427734,
"learning_rate": 3.4031413612565448e-06,
"loss": 2.5663,
"step": 39
},
{
"epoch": 0.016341923318667503,
"grad_norm": 4.584639072418213,
"learning_rate": 6.8062827225130895e-06,
"loss": 1.8589,
"step": 78
},
{
"epoch": 0.02451288497800126,
"grad_norm": 4.140661239624023,
"learning_rate": 1.0209424083769634e-05,
"loss": 1.1348,
"step": 117
},
{
"epoch": 0.03268384663733501,
"grad_norm": 4.312882423400879,
"learning_rate": 1.3612565445026179e-05,
"loss": 0.8666,
"step": 156
},
{
"epoch": 0.04085480829666876,
"grad_norm": 5.689533710479736,
"learning_rate": 1.7015706806282724e-05,
"loss": 0.7726,
"step": 195
},
{
"epoch": 0.04902576995600252,
"grad_norm": 3.757542133331299,
"learning_rate": 2.0418848167539268e-05,
"loss": 0.7232,
"step": 234
},
{
"epoch": 0.05719673161533627,
"grad_norm": 3.461946487426758,
"learning_rate": 2.382198952879581e-05,
"loss": 0.6182,
"step": 273
},
{
"epoch": 0.06536769327467001,
"grad_norm": 2.7702128887176514,
"learning_rate": 2.7225130890052358e-05,
"loss": 0.6639,
"step": 312
},
{
"epoch": 0.07353865493400377,
"grad_norm": 3.7390189170837402,
"learning_rate": 3.0628272251308905e-05,
"loss": 0.5924,
"step": 351
},
{
"epoch": 0.08170961659333752,
"grad_norm": 2.9384899139404297,
"learning_rate": 3.403141361256545e-05,
"loss": 0.594,
"step": 390
},
{
"epoch": 0.08988057825267128,
"grad_norm": 4.2483930587768555,
"learning_rate": 3.743455497382199e-05,
"loss": 0.5782,
"step": 429
},
{
"epoch": 0.09805153991200503,
"grad_norm": 2.8193845748901367,
"learning_rate": 4.0837696335078535e-05,
"loss": 0.6084,
"step": 468
},
{
"epoch": 0.10622250157133878,
"grad_norm": 2.6130402088165283,
"learning_rate": 4.424083769633508e-05,
"loss": 0.5876,
"step": 507
},
{
"epoch": 0.11439346323067254,
"grad_norm": 2.0634474754333496,
"learning_rate": 4.764397905759162e-05,
"loss": 0.5596,
"step": 546
},
{
"epoch": 0.12256442489000628,
"grad_norm": 3.275634288787842,
"learning_rate": 5.104712041884817e-05,
"loss": 0.6195,
"step": 585
},
{
"epoch": 0.13073538654934003,
"grad_norm": 2.8859968185424805,
"learning_rate": 5.4450261780104716e-05,
"loss": 0.5626,
"step": 624
},
{
"epoch": 0.13890634820867379,
"grad_norm": 2.7095537185668945,
"learning_rate": 5.785340314136126e-05,
"loss": 0.5534,
"step": 663
},
{
"epoch": 0.14707730986800754,
"grad_norm": 2.249742031097412,
"learning_rate": 6.125654450261781e-05,
"loss": 0.5859,
"step": 702
},
{
"epoch": 0.1552482715273413,
"grad_norm": 3.251708745956421,
"learning_rate": 6.465968586387435e-05,
"loss": 0.5644,
"step": 741
},
{
"epoch": 0.16341923318667503,
"grad_norm": 2.18890380859375,
"learning_rate": 6.80628272251309e-05,
"loss": 0.5465,
"step": 780
},
{
"epoch": 0.1715901948460088,
"grad_norm": 1.5962858200073242,
"learning_rate": 7.146596858638743e-05,
"loss": 0.5377,
"step": 819
},
{
"epoch": 0.17976115650534255,
"grad_norm": 1.8569731712341309,
"learning_rate": 7.486910994764398e-05,
"loss": 0.5627,
"step": 858
},
{
"epoch": 0.1879321181646763,
"grad_norm": 2.0382535457611084,
"learning_rate": 7.827225130890053e-05,
"loss": 0.5778,
"step": 897
},
{
"epoch": 0.19610307982401007,
"grad_norm": 2.0339195728302,
"learning_rate": 8.167539267015707e-05,
"loss": 0.5407,
"step": 936
},
{
"epoch": 0.2042740414833438,
"grad_norm": 2.1229023933410645,
"learning_rate": 8.507853403141361e-05,
"loss": 0.6132,
"step": 975
},
{
"epoch": 0.21244500314267756,
"grad_norm": 1.6219509840011597,
"learning_rate": 8.848167539267016e-05,
"loss": 0.6153,
"step": 1014
},
{
"epoch": 0.22061596480201132,
"grad_norm": 2.0772364139556885,
"learning_rate": 9.18848167539267e-05,
"loss": 0.533,
"step": 1053
},
{
"epoch": 0.22878692646134507,
"grad_norm": 2.073230504989624,
"learning_rate": 9.528795811518324e-05,
"loss": 0.5661,
"step": 1092
},
{
"epoch": 0.2369578881206788,
"grad_norm": 1.96063232421875,
"learning_rate": 9.86910994764398e-05,
"loss": 0.5962,
"step": 1131
},
{
"epoch": 0.24512884978001256,
"grad_norm": 1.5799710750579834,
"learning_rate": 9.999989639826398e-05,
"loss": 0.552,
"step": 1170
},
{
"epoch": 0.2532998114393463,
"grad_norm": 1.5227395296096802,
"learning_rate": 9.999928612073995e-05,
"loss": 0.5739,
"step": 1209
},
{
"epoch": 0.26147077309868005,
"grad_norm": 1.7215867042541504,
"learning_rate": 9.99981287046695e-05,
"loss": 0.5363,
"step": 1248
},
{
"epoch": 0.26964173475801384,
"grad_norm": 1.5819590091705322,
"learning_rate": 9.999642416271812e-05,
"loss": 0.5223,
"step": 1287
},
{
"epoch": 0.27781269641734757,
"grad_norm": 1.3706092834472656,
"learning_rate": 9.999417251353851e-05,
"loss": 0.5236,
"step": 1326
},
{
"epoch": 0.28598365807668136,
"grad_norm": 1.57364022731781,
"learning_rate": 9.999137378177029e-05,
"loss": 0.5454,
"step": 1365
},
{
"epoch": 0.2941546197360151,
"grad_norm": 1.2142678499221802,
"learning_rate": 9.998802799803979e-05,
"loss": 0.5247,
"step": 1404
},
{
"epoch": 0.3023255813953488,
"grad_norm": 1.275280237197876,
"learning_rate": 9.998413519895968e-05,
"loss": 0.5048,
"step": 1443
},
{
"epoch": 0.3104965430546826,
"grad_norm": 1.4138684272766113,
"learning_rate": 9.997969542712856e-05,
"loss": 0.5675,
"step": 1482
},
{
"epoch": 0.31866750471401634,
"grad_norm": 1.2247836589813232,
"learning_rate": 9.997470873113055e-05,
"loss": 0.523,
"step": 1521
},
{
"epoch": 0.32683846637335007,
"grad_norm": 1.1925996541976929,
"learning_rate": 9.996917516553468e-05,
"loss": 0.5555,
"step": 1560
},
{
"epoch": 0.33500942803268385,
"grad_norm": 1.0786908864974976,
"learning_rate": 9.996309479089436e-05,
"loss": 0.4793,
"step": 1599
},
{
"epoch": 0.3431803896920176,
"grad_norm": 1.1928977966308594,
"learning_rate": 9.995646767374671e-05,
"loss": 0.4938,
"step": 1638
},
{
"epoch": 0.35135135135135137,
"grad_norm": 1.0230952501296997,
"learning_rate": 9.994929388661176e-05,
"loss": 0.5405,
"step": 1677
},
{
"epoch": 0.3595223130106851,
"grad_norm": 1.436375379562378,
"learning_rate": 9.994157350799176e-05,
"loss": 0.5168,
"step": 1716
},
{
"epoch": 0.36769327467001883,
"grad_norm": 1.0002754926681519,
"learning_rate": 9.993330662237024e-05,
"loss": 0.5547,
"step": 1755
},
{
"epoch": 0.3758642363293526,
"grad_norm": 1.4565436840057373,
"learning_rate": 9.992449332021114e-05,
"loss": 0.5013,
"step": 1794
},
{
"epoch": 0.38403519798868635,
"grad_norm": 0.9139441251754761,
"learning_rate": 9.991513369795777e-05,
"loss": 0.5084,
"step": 1833
},
{
"epoch": 0.39220615964802014,
"grad_norm": 1.054423451423645,
"learning_rate": 9.99052278580318e-05,
"loss": 0.5277,
"step": 1872
},
{
"epoch": 0.40037712130735387,
"grad_norm": 2.1869988441467285,
"learning_rate": 9.989477590883211e-05,
"loss": 0.5121,
"step": 1911
},
{
"epoch": 0.4085480829666876,
"grad_norm": 1.1452125310897827,
"learning_rate": 9.988377796473363e-05,
"loss": 0.4957,
"step": 1950
},
{
"epoch": 0.4167190446260214,
"grad_norm": 0.8906638622283936,
"learning_rate": 9.987223414608605e-05,
"loss": 0.4706,
"step": 1989
},
{
"epoch": 0.4248900062853551,
"grad_norm": 1.0864148139953613,
"learning_rate": 9.986014457921253e-05,
"loss": 0.5475,
"step": 2028
},
{
"epoch": 0.4330609679446889,
"grad_norm": 0.9262130856513977,
"learning_rate": 9.984750939640835e-05,
"loss": 0.5052,
"step": 2067
},
{
"epoch": 0.44123192960402263,
"grad_norm": 0.8391928672790527,
"learning_rate": 9.983432873593937e-05,
"loss": 0.4524,
"step": 2106
},
{
"epoch": 0.44940289126335636,
"grad_norm": 0.9878028631210327,
"learning_rate": 9.98206027420406e-05,
"loss": 0.4461,
"step": 2145
},
{
"epoch": 0.45757385292269015,
"grad_norm": 0.9766435623168945,
"learning_rate": 9.980633156491459e-05,
"loss": 0.4987,
"step": 2184
},
{
"epoch": 0.4657448145820239,
"grad_norm": 0.7703444957733154,
"learning_rate": 9.979151536072982e-05,
"loss": 0.4865,
"step": 2223
},
{
"epoch": 0.4739157762413576,
"grad_norm": 1.0184051990509033,
"learning_rate": 9.977615429161888e-05,
"loss": 0.4715,
"step": 2262
},
{
"epoch": 0.4820867379006914,
"grad_norm": 0.8845427632331848,
"learning_rate": 9.976024852567689e-05,
"loss": 0.4961,
"step": 2301
},
{
"epoch": 0.49025769956002513,
"grad_norm": 1.0399807691574097,
"learning_rate": 9.974379823695944e-05,
"loss": 0.5169,
"step": 2340
},
{
"epoch": 0.4984286612193589,
"grad_norm": 0.91423499584198,
"learning_rate": 9.972680360548085e-05,
"loss": 0.4783,
"step": 2379
},
{
"epoch": 0.5065996228786926,
"grad_norm": 0.9913365840911865,
"learning_rate": 9.970926481721216e-05,
"loss": 0.4726,
"step": 2418
},
{
"epoch": 0.5147705845380264,
"grad_norm": 0.9079034924507141,
"learning_rate": 9.969118206407905e-05,
"loss": 0.4486,
"step": 2457
},
{
"epoch": 0.5229415461973601,
"grad_norm": 0.9283367395401001,
"learning_rate": 9.967255554395976e-05,
"loss": 0.4654,
"step": 2496
},
{
"epoch": 0.531112507856694,
"grad_norm": 0.8894864916801453,
"learning_rate": 9.965338546068292e-05,
"loss": 0.4977,
"step": 2535
},
{
"epoch": 0.5392834695160277,
"grad_norm": 0.913196861743927,
"learning_rate": 9.963367202402538e-05,
"loss": 0.5067,
"step": 2574
},
{
"epoch": 0.5474544311753614,
"grad_norm": 0.9744309186935425,
"learning_rate": 9.961341544970984e-05,
"loss": 0.4797,
"step": 2613
},
{
"epoch": 0.5556253928346951,
"grad_norm": 1.048804759979248,
"learning_rate": 9.959261595940252e-05,
"loss": 0.4383,
"step": 2652
},
{
"epoch": 0.5637963544940289,
"grad_norm": 1.0824997425079346,
"learning_rate": 9.957127378071072e-05,
"loss": 0.4866,
"step": 2691
},
{
"epoch": 0.5719673161533627,
"grad_norm": 1.0387648344039917,
"learning_rate": 9.954938914718035e-05,
"loss": 0.4659,
"step": 2730
},
{
"epoch": 0.5801382778126964,
"grad_norm": 0.8543350100517273,
"learning_rate": 9.952696229829335e-05,
"loss": 0.4576,
"step": 2769
},
{
"epoch": 0.5883092394720302,
"grad_norm": 0.8038038015365601,
"learning_rate": 9.950399347946512e-05,
"loss": 0.4594,
"step": 2808
},
{
"epoch": 0.5964802011313639,
"grad_norm": 1.0086504220962524,
"learning_rate": 9.948048294204175e-05,
"loss": 0.4974,
"step": 2847
},
{
"epoch": 0.6046511627906976,
"grad_norm": 0.9558520913124084,
"learning_rate": 9.945643094329735e-05,
"loss": 0.4723,
"step": 2886
},
{
"epoch": 0.6128221244500315,
"grad_norm": 1.0693845748901367,
"learning_rate": 9.943183774643116e-05,
"loss": 0.4563,
"step": 2925
},
{
"epoch": 0.6209930861093652,
"grad_norm": 0.8957254886627197,
"learning_rate": 9.94067036205648e-05,
"loss": 0.4668,
"step": 2964
},
{
"epoch": 0.6291640477686989,
"grad_norm": 0.8896434903144836,
"learning_rate": 9.938102884073914e-05,
"loss": 0.4707,
"step": 3003
},
{
"epoch": 0.6373350094280327,
"grad_norm": 1.0319199562072754,
"learning_rate": 9.935481368791141e-05,
"loss": 0.484,
"step": 3042
},
{
"epoch": 0.6455059710873664,
"grad_norm": 0.8816359043121338,
"learning_rate": 9.932805844895216e-05,
"loss": 0.475,
"step": 3081
},
{
"epoch": 0.6536769327467001,
"grad_norm": 0.7812872529029846,
"learning_rate": 9.930076341664201e-05,
"loss": 0.4838,
"step": 3120
},
{
"epoch": 0.661847894406034,
"grad_norm": 1.0631335973739624,
"learning_rate": 9.927292888966848e-05,
"loss": 0.4791,
"step": 3159
},
{
"epoch": 0.6700188560653677,
"grad_norm": 0.9721090793609619,
"learning_rate": 9.92445551726228e-05,
"loss": 0.4701,
"step": 3198
},
{
"epoch": 0.6781898177247014,
"grad_norm": 0.9454875588417053,
"learning_rate": 9.921564257599649e-05,
"loss": 0.4397,
"step": 3237
},
{
"epoch": 0.6863607793840352,
"grad_norm": 0.6671223044395447,
"learning_rate": 9.918619141617797e-05,
"loss": 0.4557,
"step": 3276
},
{
"epoch": 0.6945317410433689,
"grad_norm": 0.6020123958587646,
"learning_rate": 9.915620201544915e-05,
"loss": 0.4454,
"step": 3315
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.5989274382591248,
"learning_rate": 9.912567470198186e-05,
"loss": 0.4762,
"step": 3354
},
{
"epoch": 0.7108736643620365,
"grad_norm": 0.9229285717010498,
"learning_rate": 9.909460980983427e-05,
"loss": 0.4928,
"step": 3393
},
{
"epoch": 0.7190446260213702,
"grad_norm": 0.8499952554702759,
"learning_rate": 9.906300767894721e-05,
"loss": 0.4491,
"step": 3432
},
{
"epoch": 0.7272155876807039,
"grad_norm": 0.801685631275177,
"learning_rate": 9.903086865514053e-05,
"loss": 0.4496,
"step": 3471
},
{
"epoch": 0.7353865493400377,
"grad_norm": 1.049750566482544,
"learning_rate": 9.89981930901092e-05,
"loss": 0.4741,
"step": 3510
},
{
"epoch": 0.7435575109993715,
"grad_norm": 0.7072298526763916,
"learning_rate": 9.896498134141957e-05,
"loss": 0.454,
"step": 3549
},
{
"epoch": 0.7517284726587052,
"grad_norm": 0.674797773361206,
"learning_rate": 9.893123377250535e-05,
"loss": 0.4244,
"step": 3588
},
{
"epoch": 0.759899434318039,
"grad_norm": 0.6849934458732605,
"learning_rate": 9.889695075266377e-05,
"loss": 0.4631,
"step": 3627
},
{
"epoch": 0.7680703959773727,
"grad_norm": 0.6002283692359924,
"learning_rate": 9.88621326570514e-05,
"loss": 0.4936,
"step": 3666
},
{
"epoch": 0.7762413576367064,
"grad_norm": 0.8119000196456909,
"learning_rate": 9.882677986668014e-05,
"loss": 0.471,
"step": 3705
},
{
"epoch": 0.7844123192960403,
"grad_norm": 0.8308870792388916,
"learning_rate": 9.879089276841297e-05,
"loss": 0.4419,
"step": 3744
},
{
"epoch": 0.792583280955374,
"grad_norm": 0.7574063539505005,
"learning_rate": 9.875447175495983e-05,
"loss": 0.4286,
"step": 3783
},
{
"epoch": 0.8007542426147077,
"grad_norm": 0.7497377991676331,
"learning_rate": 9.871751722487317e-05,
"loss": 0.4773,
"step": 3822
},
{
"epoch": 0.8089252042740415,
"grad_norm": 0.761298418045044,
"learning_rate": 9.868002958254377e-05,
"loss": 0.4813,
"step": 3861
},
{
"epoch": 0.8170961659333752,
"grad_norm": 0.9213526844978333,
"learning_rate": 9.864200923819613e-05,
"loss": 0.4777,
"step": 3900
},
{
"epoch": 0.825267127592709,
"grad_norm": 0.9028540849685669,
"learning_rate": 9.860345660788414e-05,
"loss": 0.4302,
"step": 3939
},
{
"epoch": 0.8334380892520428,
"grad_norm": 0.6980260610580444,
"learning_rate": 9.856437211348641e-05,
"loss": 0.4283,
"step": 3978
},
{
"epoch": 0.8416090509113765,
"grad_norm": 0.6451707482337952,
"learning_rate": 9.852475618270172e-05,
"loss": 0.425,
"step": 4017
},
{
"epoch": 0.8497800125707102,
"grad_norm": 0.712598443031311,
"learning_rate": 9.848460924904432e-05,
"loss": 0.4638,
"step": 4056
},
{
"epoch": 0.857950974230044,
"grad_norm": 0.614088773727417,
"learning_rate": 9.844393175183917e-05,
"loss": 0.4557,
"step": 4095
},
{
"epoch": 0.8661219358893778,
"grad_norm": 0.8028059601783752,
"learning_rate": 9.840272413621716e-05,
"loss": 0.4699,
"step": 4134
},
{
"epoch": 0.8742928975487115,
"grad_norm": 0.8500687479972839,
"learning_rate": 9.836098685311024e-05,
"loss": 0.4392,
"step": 4173
},
{
"epoch": 0.8824638592080453,
"grad_norm": 0.7735748291015625,
"learning_rate": 9.831872035924645e-05,
"loss": 0.4197,
"step": 4212
},
{
"epoch": 0.890634820867379,
"grad_norm": 0.6959074139595032,
"learning_rate": 9.827592511714493e-05,
"loss": 0.4347,
"step": 4251
},
{
"epoch": 0.8988057825267127,
"grad_norm": 0.5992249846458435,
"learning_rate": 9.823260159511096e-05,
"loss": 0.4327,
"step": 4290
},
{
"epoch": 0.9069767441860465,
"grad_norm": 0.591803252696991,
"learning_rate": 9.818875026723063e-05,
"loss": 0.463,
"step": 4329
},
{
"epoch": 0.9151477058453803,
"grad_norm": 0.7650397419929504,
"learning_rate": 9.814437161336583e-05,
"loss": 0.4393,
"step": 4368
},
{
"epoch": 0.923318667504714,
"grad_norm": 0.7083756923675537,
"learning_rate": 9.809946611914896e-05,
"loss": 0.4431,
"step": 4407
},
{
"epoch": 0.9314896291640478,
"grad_norm": 0.5570526719093323,
"learning_rate": 9.805403427597757e-05,
"loss": 0.4293,
"step": 4446
},
{
"epoch": 0.9396605908233815,
"grad_norm": 0.7944348454475403,
"learning_rate": 9.800807658100902e-05,
"loss": 0.4331,
"step": 4485
},
{
"epoch": 0.9478315524827152,
"grad_norm": 0.9344629645347595,
"learning_rate": 9.796159353715498e-05,
"loss": 0.4461,
"step": 4524
},
{
"epoch": 0.9560025141420491,
"grad_norm": 0.6917528510093689,
"learning_rate": 9.791458565307604e-05,
"loss": 0.4441,
"step": 4563
},
{
"epoch": 0.9641734758013828,
"grad_norm": 0.6871070861816406,
"learning_rate": 9.786705344317606e-05,
"loss": 0.4461,
"step": 4602
},
{
"epoch": 0.9723444374607165,
"grad_norm": 0.7774878740310669,
"learning_rate": 9.781899742759652e-05,
"loss": 0.4693,
"step": 4641
},
{
"epoch": 0.9805153991200503,
"grad_norm": 0.7668758630752563,
"learning_rate": 9.777041813221095e-05,
"loss": 0.4073,
"step": 4680
},
{
"epoch": 0.988686360779384,
"grad_norm": 0.7705347537994385,
"learning_rate": 9.772131608861899e-05,
"loss": 0.474,
"step": 4719
},
{
"epoch": 0.9968573224387178,
"grad_norm": 0.8965923190116882,
"learning_rate": 9.767169183414075e-05,
"loss": 0.4741,
"step": 4758
},
{
"epoch": 1.0050282840980516,
"grad_norm": 0.7429456114768982,
"learning_rate": 9.762154591181083e-05,
"loss": 0.4326,
"step": 4797
},
{
"epoch": 1.0131992457573853,
"grad_norm": 0.8201857209205627,
"learning_rate": 9.757087887037241e-05,
"loss": 0.4252,
"step": 4836
},
{
"epoch": 1.021370207416719,
"grad_norm": 0.9046009182929993,
"learning_rate": 9.751969126427122e-05,
"loss": 0.4176,
"step": 4875
},
{
"epoch": 1.0295411690760528,
"grad_norm": 0.9283869862556458,
"learning_rate": 9.746798365364952e-05,
"loss": 0.4024,
"step": 4914
},
{
"epoch": 1.0377121307353865,
"grad_norm": 0.8236905932426453,
"learning_rate": 9.741575660433993e-05,
"loss": 0.409,
"step": 4953
},
{
"epoch": 1.0458830923947202,
"grad_norm": 0.713076651096344,
"learning_rate": 9.736301068785923e-05,
"loss": 0.4466,
"step": 4992
},
{
"epoch": 1.054054054054054,
"grad_norm": 0.8006173372268677,
"learning_rate": 9.730974648140214e-05,
"loss": 0.3786,
"step": 5031
},
{
"epoch": 1.062225015713388,
"grad_norm": 0.924655020236969,
"learning_rate": 9.725596456783502e-05,
"loss": 0.4276,
"step": 5070
},
{
"epoch": 1.0703959773727216,
"grad_norm": 0.7090715765953064,
"learning_rate": 9.72016655356894e-05,
"loss": 0.3978,
"step": 5109
},
{
"epoch": 1.0785669390320554,
"grad_norm": 0.8049948215484619,
"learning_rate": 9.714684997915566e-05,
"loss": 0.423,
"step": 5148
},
{
"epoch": 1.086737900691389,
"grad_norm": 0.7052054405212402,
"learning_rate": 9.709151849807643e-05,
"loss": 0.3803,
"step": 5187
},
{
"epoch": 1.0949088623507228,
"grad_norm": 1.0038681030273438,
"learning_rate": 9.703567169794008e-05,
"loss": 0.4221,
"step": 5226
},
{
"epoch": 1.1030798240100566,
"grad_norm": 0.6527186036109924,
"learning_rate": 9.697931018987408e-05,
"loss": 0.3858,
"step": 5265
},
{
"epoch": 1.1112507856693903,
"grad_norm": 0.7536753416061401,
"learning_rate": 9.69224345906383e-05,
"loss": 0.3981,
"step": 5304
},
{
"epoch": 1.119421747328724,
"grad_norm": 1.1964749097824097,
"learning_rate": 9.68650455226183e-05,
"loss": 0.3858,
"step": 5343
},
{
"epoch": 1.1275927089880577,
"grad_norm": 0.717378556728363,
"learning_rate": 9.680714361381844e-05,
"loss": 0.4056,
"step": 5382
},
{
"epoch": 1.1357636706473917,
"grad_norm": 0.8927372694015503,
"learning_rate": 9.674872949785511e-05,
"loss": 0.3831,
"step": 5421
},
{
"epoch": 1.1439346323067254,
"grad_norm": 0.8325986266136169,
"learning_rate": 9.668980381394972e-05,
"loss": 0.3996,
"step": 5460
},
{
"epoch": 1.1521055939660592,
"grad_norm": 0.8707286715507507,
"learning_rate": 9.663036720692175e-05,
"loss": 0.4171,
"step": 5499
},
{
"epoch": 1.160276555625393,
"grad_norm": 0.6361021399497986,
"learning_rate": 9.657042032718165e-05,
"loss": 0.4033,
"step": 5538
},
{
"epoch": 1.1684475172847266,
"grad_norm": 0.9237117171287537,
"learning_rate": 9.650996383072375e-05,
"loss": 0.3497,
"step": 5577
},
{
"epoch": 1.1766184789440604,
"grad_norm": 0.7840002179145813,
"learning_rate": 9.644899837911912e-05,
"loss": 0.3789,
"step": 5616
},
{
"epoch": 1.184789440603394,
"grad_norm": 0.6757360100746155,
"learning_rate": 9.638752463950821e-05,
"loss": 0.4085,
"step": 5655
},
{
"epoch": 1.1929604022627278,
"grad_norm": 0.8231393098831177,
"learning_rate": 9.632554328459371e-05,
"loss": 0.4015,
"step": 5694
},
{
"epoch": 1.2011313639220615,
"grad_norm": 1.001394271850586,
"learning_rate": 9.626305499263307e-05,
"loss": 0.3671,
"step": 5733
},
{
"epoch": 1.2093023255813953,
"grad_norm": 0.7362861633300781,
"learning_rate": 9.620006044743111e-05,
"loss": 0.4108,
"step": 5772
},
{
"epoch": 1.217473287240729,
"grad_norm": 0.873781144618988,
"learning_rate": 9.613656033833255e-05,
"loss": 0.3959,
"step": 5811
},
{
"epoch": 1.2256442489000627,
"grad_norm": 0.745134174823761,
"learning_rate": 9.607255536021445e-05,
"loss": 0.4003,
"step": 5850
},
{
"epoch": 1.2338152105593967,
"grad_norm": 0.9675105810165405,
"learning_rate": 9.600804621347865e-05,
"loss": 0.4446,
"step": 5889
},
{
"epoch": 1.2419861722187304,
"grad_norm": 0.6674776673316956,
"learning_rate": 9.594303360404401e-05,
"loss": 0.3975,
"step": 5928
},
{
"epoch": 1.2501571338780642,
"grad_norm": 1.0288015604019165,
"learning_rate": 9.587751824333882e-05,
"loss": 0.371,
"step": 5967
},
{
"epoch": 1.2583280955373979,
"grad_norm": 1.1984747648239136,
"learning_rate": 9.581150084829287e-05,
"loss": 0.391,
"step": 6006
},
{
"epoch": 1.2664990571967316,
"grad_norm": 0.7332170009613037,
"learning_rate": 9.574498214132971e-05,
"loss": 0.4048,
"step": 6045
},
{
"epoch": 1.2746700188560653,
"grad_norm": 0.7564400434494019,
"learning_rate": 9.56779628503587e-05,
"loss": 0.4403,
"step": 6084
},
{
"epoch": 1.282840980515399,
"grad_norm": 0.8034684658050537,
"learning_rate": 9.561044370876709e-05,
"loss": 0.3841,
"step": 6123
},
{
"epoch": 1.2910119421747328,
"grad_norm": 0.8448123335838318,
"learning_rate": 9.55424254554119e-05,
"loss": 0.3906,
"step": 6162
},
{
"epoch": 1.2991829038340668,
"grad_norm": 1.2661123275756836,
"learning_rate": 9.547390883461194e-05,
"loss": 0.3858,
"step": 6201
},
{
"epoch": 1.3073538654934005,
"grad_norm": 0.7595413327217102,
"learning_rate": 9.54048945961396e-05,
"loss": 0.4283,
"step": 6240
},
{
"epoch": 1.3155248271527342,
"grad_norm": 0.8576300740242004,
"learning_rate": 9.533538349521263e-05,
"loss": 0.3916,
"step": 6279
},
{
"epoch": 1.323695788812068,
"grad_norm": 0.5660988092422485,
"learning_rate": 9.526537629248598e-05,
"loss": 0.3639,
"step": 6318
},
{
"epoch": 1.3318667504714017,
"grad_norm": 0.7187468409538269,
"learning_rate": 9.519487375404337e-05,
"loss": 0.3861,
"step": 6357
},
{
"epoch": 1.3400377121307354,
"grad_norm": 0.8099279999732971,
"learning_rate": 9.512387665138894e-05,
"loss": 0.3975,
"step": 6396
},
{
"epoch": 1.3482086737900691,
"grad_norm": 0.8016268610954285,
"learning_rate": 9.50523857614388e-05,
"loss": 0.3751,
"step": 6435
},
{
"epoch": 1.3563796354494029,
"grad_norm": 0.6431599855422974,
"learning_rate": 9.498040186651258e-05,
"loss": 0.4014,
"step": 6474
},
{
"epoch": 1.3645505971087366,
"grad_norm": 0.8925536274909973,
"learning_rate": 9.490792575432475e-05,
"loss": 0.3871,
"step": 6513
},
{
"epoch": 1.3727215587680703,
"grad_norm": 0.8553928136825562,
"learning_rate": 9.483495821797619e-05,
"loss": 0.3949,
"step": 6552
},
{
"epoch": 1.380892520427404,
"grad_norm": 0.8019265532493591,
"learning_rate": 9.476150005594528e-05,
"loss": 0.4086,
"step": 6591
},
{
"epoch": 1.3890634820867378,
"grad_norm": 0.7925927639007568,
"learning_rate": 9.468755207207937e-05,
"loss": 0.4024,
"step": 6630
},
{
"epoch": 1.3972344437460715,
"grad_norm": 0.7827038168907166,
"learning_rate": 9.461311507558586e-05,
"loss": 0.421,
"step": 6669
},
{
"epoch": 1.4054054054054055,
"grad_norm": 0.5531708598136902,
"learning_rate": 9.453818988102336e-05,
"loss": 0.4183,
"step": 6708
},
{
"epoch": 1.4135763670647392,
"grad_norm": 0.7605539560317993,
"learning_rate": 9.446277730829284e-05,
"loss": 0.4314,
"step": 6747
},
{
"epoch": 1.421747328724073,
"grad_norm": 0.49756988883018494,
"learning_rate": 9.438687818262857e-05,
"loss": 0.4118,
"step": 6786
},
{
"epoch": 1.4299182903834067,
"grad_norm": 0.9179204106330872,
"learning_rate": 9.431049333458917e-05,
"loss": 0.4027,
"step": 6825
},
{
"epoch": 1.4380892520427404,
"grad_norm": 0.7112446427345276,
"learning_rate": 9.423362360004848e-05,
"loss": 0.3961,
"step": 6864
},
{
"epoch": 1.4462602137020741,
"grad_norm": 0.6417878866195679,
"learning_rate": 9.415626982018637e-05,
"loss": 0.426,
"step": 6903
},
{
"epoch": 1.4544311753614079,
"grad_norm": 0.7054896950721741,
"learning_rate": 9.407843284147966e-05,
"loss": 0.3592,
"step": 6942
},
{
"epoch": 1.4626021370207416,
"grad_norm": 0.3770501911640167,
"learning_rate": 9.400011351569272e-05,
"loss": 0.4205,
"step": 6981
},
{
"epoch": 1.4707730986800756,
"grad_norm": 0.7424401044845581,
"learning_rate": 9.392131269986821e-05,
"loss": 0.3862,
"step": 7020
},
{
"epoch": 1.4789440603394093,
"grad_norm": 0.7774800062179565,
"learning_rate": 9.384203125631774e-05,
"loss": 0.4186,
"step": 7059
},
{
"epoch": 1.487115021998743,
"grad_norm": 0.6684144735336304,
"learning_rate": 9.376227005261237e-05,
"loss": 0.3793,
"step": 7098
},
{
"epoch": 1.4952859836580767,
"grad_norm": 0.8839356899261475,
"learning_rate": 9.368202996157314e-05,
"loss": 0.3927,
"step": 7137
},
{
"epoch": 1.5034569453174105,
"grad_norm": 0.7853700518608093,
"learning_rate": 9.36013118612615e-05,
"loss": 0.3842,
"step": 7176
},
{
"epoch": 1.5116279069767442,
"grad_norm": 0.9322993159294128,
"learning_rate": 9.35201166349698e-05,
"loss": 0.3737,
"step": 7215
},
{
"epoch": 1.519798868636078,
"grad_norm": 0.7737587094306946,
"learning_rate": 9.343844517121145e-05,
"loss": 0.3681,
"step": 7254
},
{
"epoch": 1.5279698302954117,
"grad_norm": 0.7644033432006836,
"learning_rate": 9.335629836371132e-05,
"loss": 0.3786,
"step": 7293
},
{
"epoch": 1.5361407919547454,
"grad_norm": 0.7909252643585205,
"learning_rate": 9.327367711139596e-05,
"loss": 0.4039,
"step": 7332
},
{
"epoch": 1.5443117536140791,
"grad_norm": 0.5451653599739075,
"learning_rate": 9.31905823183837e-05,
"loss": 0.3507,
"step": 7371
},
{
"epoch": 1.5524827152734129,
"grad_norm": 0.7788479328155518,
"learning_rate": 9.310701489397485e-05,
"loss": 0.3732,
"step": 7410
},
{
"epoch": 1.5606536769327466,
"grad_norm": 0.8460758924484253,
"learning_rate": 9.302297575264159e-05,
"loss": 0.3692,
"step": 7449
},
{
"epoch": 1.5688246385920803,
"grad_norm": 0.8979092240333557,
"learning_rate": 9.293846581401815e-05,
"loss": 0.4246,
"step": 7488
},
{
"epoch": 1.576995600251414,
"grad_norm": 0.8408267498016357,
"learning_rate": 9.285348600289063e-05,
"loss": 0.4018,
"step": 7527
},
{
"epoch": 1.585166561910748,
"grad_norm": 0.9224086999893188,
"learning_rate": 9.276803724918692e-05,
"loss": 0.3774,
"step": 7566
},
{
"epoch": 1.5933375235700817,
"grad_norm": 0.760299563407898,
"learning_rate": 9.268212048796652e-05,
"loss": 0.4074,
"step": 7605
},
{
"epoch": 1.6015084852294155,
"grad_norm": 0.6859989166259766,
"learning_rate": 9.259573665941027e-05,
"loss": 0.4017,
"step": 7644
},
{
"epoch": 1.6096794468887492,
"grad_norm": 0.5509856939315796,
"learning_rate": 9.250888670881011e-05,
"loss": 0.3785,
"step": 7683
},
{
"epoch": 1.617850408548083,
"grad_norm": 0.8195124864578247,
"learning_rate": 9.242157158655875e-05,
"loss": 0.3985,
"step": 7722
},
{
"epoch": 1.6260213702074169,
"grad_norm": 0.6665972471237183,
"learning_rate": 9.23337922481392e-05,
"loss": 0.3827,
"step": 7761
},
{
"epoch": 1.6341923318667506,
"grad_norm": 0.7980179786682129,
"learning_rate": 9.224554965411435e-05,
"loss": 0.3854,
"step": 7800
},
{
"epoch": 1.6423632935260843,
"grad_norm": 0.7136581540107727,
"learning_rate": 9.21568447701165e-05,
"loss": 0.3632,
"step": 7839
},
{
"epoch": 1.650534255185418,
"grad_norm": 0.5472484230995178,
"learning_rate": 9.206767856683674e-05,
"loss": 0.3906,
"step": 7878
},
{
"epoch": 1.6587052168447518,
"grad_norm": 0.7572371959686279,
"learning_rate": 9.19780520200143e-05,
"loss": 0.4074,
"step": 7917
},
{
"epoch": 1.6668761785040855,
"grad_norm": 0.7541080117225647,
"learning_rate": 9.1887966110426e-05,
"loss": 0.3769,
"step": 7956
},
{
"epoch": 1.6750471401634193,
"grad_norm": 0.7768887281417847,
"learning_rate": 9.179742182387538e-05,
"loss": 0.3887,
"step": 7995
},
{
"epoch": 1.683218101822753,
"grad_norm": 0.4816763699054718,
"learning_rate": 9.170642015118195e-05,
"loss": 0.3968,
"step": 8034
},
{
"epoch": 1.6913890634820867,
"grad_norm": 0.798069953918457,
"learning_rate": 9.16149620881704e-05,
"loss": 0.3929,
"step": 8073
},
{
"epoch": 1.6995600251414205,
"grad_norm": 0.6265507340431213,
"learning_rate": 9.152304863565965e-05,
"loss": 0.3509,
"step": 8112
},
{
"epoch": 1.7077309868007542,
"grad_norm": 0.7725851535797119,
"learning_rate": 9.143068079945191e-05,
"loss": 0.416,
"step": 8151
},
{
"epoch": 1.715901948460088,
"grad_norm": 0.7044919729232788,
"learning_rate": 9.133785959032172e-05,
"loss": 0.3901,
"step": 8190
},
{
"epoch": 1.7240729101194217,
"grad_norm": 0.7072455883026123,
"learning_rate": 9.124458602400476e-05,
"loss": 0.4162,
"step": 8229
},
{
"epoch": 1.7322438717787554,
"grad_norm": 0.6308450698852539,
"learning_rate": 9.11508611211869e-05,
"loss": 0.3617,
"step": 8268
},
{
"epoch": 1.7404148334380891,
"grad_norm": 0.9505504369735718,
"learning_rate": 9.105668590749292e-05,
"loss": 0.4235,
"step": 8307
},
{
"epoch": 1.7485857950974228,
"grad_norm": 0.6861995458602905,
"learning_rate": 9.096206141347533e-05,
"loss": 0.3757,
"step": 8346
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.7003109455108643,
"learning_rate": 9.086698867460306e-05,
"loss": 0.3672,
"step": 8385
},
{
"epoch": 1.7649277184160905,
"grad_norm": 0.7863865494728088,
"learning_rate": 9.07714687312502e-05,
"loss": 0.4294,
"step": 8424
},
{
"epoch": 1.7730986800754243,
"grad_norm": 0.7616212964057922,
"learning_rate": 9.067550262868449e-05,
"loss": 0.3681,
"step": 8463
},
{
"epoch": 1.781269641734758,
"grad_norm": 0.7139325737953186,
"learning_rate": 9.057909141705603e-05,
"loss": 0.4084,
"step": 8502
},
{
"epoch": 1.7894406033940917,
"grad_norm": 0.6723546981811523,
"learning_rate": 9.04822361513857e-05,
"loss": 0.4052,
"step": 8541
},
{
"epoch": 1.7976115650534257,
"grad_norm": 0.6281395554542542,
"learning_rate": 9.038493789155356e-05,
"loss": 0.3924,
"step": 8580
},
{
"epoch": 1.8057825267127594,
"grad_norm": 0.7400838732719421,
"learning_rate": 9.028719770228744e-05,
"loss": 0.4011,
"step": 8619
},
{
"epoch": 1.8139534883720931,
"grad_norm": 1.0410945415496826,
"learning_rate": 9.01890166531511e-05,
"loss": 0.385,
"step": 8658
},
{
"epoch": 1.8221244500314269,
"grad_norm": 0.7112125158309937,
"learning_rate": 9.009039581853259e-05,
"loss": 0.4204,
"step": 8697
},
{
"epoch": 1.8302954116907606,
"grad_norm": 0.667522668838501,
"learning_rate": 8.999133627763252e-05,
"loss": 0.4274,
"step": 8736
},
{
"epoch": 1.8384663733500943,
"grad_norm": 0.6372833251953125,
"learning_rate": 8.989183911445228e-05,
"loss": 0.3725,
"step": 8775
},
{
"epoch": 1.846637335009428,
"grad_norm": 0.8261222839355469,
"learning_rate": 8.979190541778199e-05,
"loss": 0.4256,
"step": 8814
},
{
"epoch": 1.8548082966687618,
"grad_norm": 0.6333631873130798,
"learning_rate": 8.969153628118891e-05,
"loss": 0.3836,
"step": 8853
},
{
"epoch": 1.8629792583280955,
"grad_norm": 0.6916419863700867,
"learning_rate": 8.959073280300514e-05,
"loss": 0.3756,
"step": 8892
},
{
"epoch": 1.8711502199874293,
"grad_norm": 0.5185075998306274,
"learning_rate": 8.948949608631578e-05,
"loss": 0.3772,
"step": 8931
},
{
"epoch": 1.879321181646763,
"grad_norm": 0.8508359789848328,
"learning_rate": 8.93878272389469e-05,
"loss": 0.42,
"step": 8970
},
{
"epoch": 1.8874921433060967,
"grad_norm": 0.6042287349700928,
"learning_rate": 8.928572737345328e-05,
"loss": 0.4111,
"step": 9009
},
{
"epoch": 1.8956631049654304,
"grad_norm": 0.8972792625427246,
"learning_rate": 8.918319760710629e-05,
"loss": 0.4178,
"step": 9048
},
{
"epoch": 1.9038340666247642,
"grad_norm": 0.7205092310905457,
"learning_rate": 8.90802390618817e-05,
"loss": 0.3947,
"step": 9087
},
{
"epoch": 1.912005028284098,
"grad_norm": 0.7933651804924011,
"learning_rate": 8.897685286444737e-05,
"loss": 0.3934,
"step": 9126
},
{
"epoch": 1.9201759899434316,
"grad_norm": 0.7529005408287048,
"learning_rate": 8.887304014615094e-05,
"loss": 0.3951,
"step": 9165
},
{
"epoch": 1.9283469516027656,
"grad_norm": 0.5970722436904907,
"learning_rate": 8.876880204300744e-05,
"loss": 0.3671,
"step": 9204
},
{
"epoch": 1.9365179132620993,
"grad_norm": 0.7347320914268494,
"learning_rate": 8.86641396956868e-05,
"loss": 0.4362,
"step": 9243
},
{
"epoch": 1.944688874921433,
"grad_norm": 0.6225306391716003,
"learning_rate": 8.855905424950149e-05,
"loss": 0.4039,
"step": 9282
},
{
"epoch": 1.9528598365807668,
"grad_norm": 0.7711159586906433,
"learning_rate": 8.845354685439388e-05,
"loss": 0.3766,
"step": 9321
},
{
"epoch": 1.9610307982401005,
"grad_norm": 0.7624640464782715,
"learning_rate": 8.834761866492373e-05,
"loss": 0.3854,
"step": 9360
},
{
"epoch": 1.9692017598994345,
"grad_norm": 0.8946365118026733,
"learning_rate": 8.824127084025551e-05,
"loss": 0.3675,
"step": 9399
},
{
"epoch": 1.9773727215587682,
"grad_norm": 0.5043982863426208,
"learning_rate": 8.813450454414567e-05,
"loss": 0.4078,
"step": 9438
},
{
"epoch": 1.985543683218102,
"grad_norm": 0.6283956170082092,
"learning_rate": 8.802732094493007e-05,
"loss": 0.382,
"step": 9477
},
{
"epoch": 1.9937146448774357,
"grad_norm": 0.6801990866661072,
"learning_rate": 8.7919721215511e-05,
"loss": 0.3883,
"step": 9516
},
{
"epoch": 2.0018856065367694,
"grad_norm": 0.6821087598800659,
"learning_rate": 8.781170653334445e-05,
"loss": 0.3598,
"step": 9555
},
{
"epoch": 2.010056568196103,
"grad_norm": 0.8372517824172974,
"learning_rate": 8.770327808042724e-05,
"loss": 0.3097,
"step": 9594
},
{
"epoch": 2.018227529855437,
"grad_norm": 0.5700801014900208,
"learning_rate": 8.759443704328405e-05,
"loss": 0.2982,
"step": 9633
},
{
"epoch": 2.0263984915147706,
"grad_norm": 0.7326349020004272,
"learning_rate": 8.748518461295438e-05,
"loss": 0.344,
"step": 9672
},
{
"epoch": 2.0345694531741043,
"grad_norm": 0.7985721230506897,
"learning_rate": 8.737552198497965e-05,
"loss": 0.3516,
"step": 9711
},
{
"epoch": 2.042740414833438,
"grad_norm": 0.8567628264427185,
"learning_rate": 8.726545035939e-05,
"loss": 0.3252,
"step": 9750
},
{
"epoch": 2.0509113764927718,
"grad_norm": 0.9082648158073425,
"learning_rate": 8.715497094069121e-05,
"loss": 0.3487,
"step": 9789
},
{
"epoch": 2.0590823381521055,
"grad_norm": 1.0632450580596924,
"learning_rate": 8.70440849378515e-05,
"loss": 0.3224,
"step": 9828
},
{
"epoch": 2.0672532998114392,
"grad_norm": 0.8348027467727661,
"learning_rate": 8.693279356428835e-05,
"loss": 0.313,
"step": 9867
},
{
"epoch": 2.075424261470773,
"grad_norm": 0.6354735493659973,
"learning_rate": 8.682109803785514e-05,
"loss": 0.3337,
"step": 9906
},
{
"epoch": 2.0835952231301067,
"grad_norm": 0.9293564558029175,
"learning_rate": 8.67089995808279e-05,
"loss": 0.3353,
"step": 9945
},
{
"epoch": 2.0917661847894404,
"grad_norm": 0.653914213180542,
"learning_rate": 8.659649941989186e-05,
"loss": 0.3348,
"step": 9984
},
{
"epoch": 2.0951183741881416,
"eval_accuracy": 0.009820309467613697,
"eval_loss": 0.4311712086200714,
"eval_runtime": 816.6202,
"eval_samples_per_second": 5.86,
"eval_steps_per_second": 1.466,
"step": 10000
},
{
"epoch": 2.099937146448774,
"grad_norm": 0.7376769781112671,
"learning_rate": 8.648359878612807e-05,
"loss": 0.3043,
"step": 10023
},
{
"epoch": 2.108108108108108,
"grad_norm": 0.965570330619812,
"learning_rate": 8.637029891499997e-05,
"loss": 0.3619,
"step": 10062
},
{
"epoch": 2.116279069767442,
"grad_norm": 0.8820568919181824,
"learning_rate": 8.625660104633981e-05,
"loss": 0.3519,
"step": 10101
},
{
"epoch": 2.124450031426776,
"grad_norm": 0.9719869494438171,
"learning_rate": 8.614250642433506e-05,
"loss": 0.3524,
"step": 10140
},
{
"epoch": 2.1326209930861095,
"grad_norm": 0.8123674392700195,
"learning_rate": 8.602801629751486e-05,
"loss": 0.3324,
"step": 10179
},
{
"epoch": 2.1407919547454433,
"grad_norm": 0.4851992130279541,
"learning_rate": 8.591313191873634e-05,
"loss": 0.3331,
"step": 10218
},
{
"epoch": 2.148962916404777,
"grad_norm": 0.7514449954032898,
"learning_rate": 8.579785454517089e-05,
"loss": 0.3195,
"step": 10257
},
{
"epoch": 2.1571338780641107,
"grad_norm": 0.6087148785591125,
"learning_rate": 8.568218543829039e-05,
"loss": 0.3819,
"step": 10296
},
{
"epoch": 2.1653048397234445,
"grad_norm": 0.547778308391571,
"learning_rate": 8.556612586385349e-05,
"loss": 0.3231,
"step": 10335
},
{
"epoch": 2.173475801382778,
"grad_norm": 1.0253907442092896,
"learning_rate": 8.544967709189162e-05,
"loss": 0.3325,
"step": 10374
},
{
"epoch": 2.181646763042112,
"grad_norm": 0.7467255592346191,
"learning_rate": 8.533284039669524e-05,
"loss": 0.3224,
"step": 10413
},
{
"epoch": 2.1898177247014456,
"grad_norm": 0.8590918779373169,
"learning_rate": 8.52156170567998e-05,
"loss": 0.3293,
"step": 10452
},
{
"epoch": 2.1979886863607794,
"grad_norm": 0.6754146814346313,
"learning_rate": 8.509800835497175e-05,
"loss": 0.3227,
"step": 10491
},
{
"epoch": 2.206159648020113,
"grad_norm": 0.6803106069564819,
"learning_rate": 8.498001557819455e-05,
"loss": 0.3381,
"step": 10530
},
{
"epoch": 2.214330609679447,
"grad_norm": 1.0634793043136597,
"learning_rate": 8.486164001765457e-05,
"loss": 0.3659,
"step": 10569
},
{
"epoch": 2.2225015713387806,
"grad_norm": 0.6654180288314819,
"learning_rate": 8.474288296872695e-05,
"loss": 0.3359,
"step": 10608
},
{
"epoch": 2.2306725329981143,
"grad_norm": 0.9800730347633362,
"learning_rate": 8.462374573096143e-05,
"loss": 0.3483,
"step": 10647
},
{
"epoch": 2.238843494657448,
"grad_norm": 0.7170436382293701,
"learning_rate": 8.45042296080681e-05,
"loss": 0.3477,
"step": 10686
},
{
"epoch": 2.2470144563167818,
"grad_norm": 0.8500722646713257,
"learning_rate": 8.438433590790323e-05,
"loss": 0.3347,
"step": 10725
},
{
"epoch": 2.2551854179761155,
"grad_norm": 0.9178040623664856,
"learning_rate": 8.426406594245482e-05,
"loss": 0.3455,
"step": 10764
},
{
"epoch": 2.2633563796354492,
"grad_norm": 1.0374339818954468,
"learning_rate": 8.414342102782833e-05,
"loss": 0.3548,
"step": 10803
},
{
"epoch": 2.2715273412947834,
"grad_norm": 0.9963250160217285,
"learning_rate": 8.40224024842323e-05,
"loss": 0.3563,
"step": 10842
},
{
"epoch": 2.2796983029541167,
"grad_norm": 0.8895733952522278,
"learning_rate": 8.390101163596385e-05,
"loss": 0.3229,
"step": 10881
},
{
"epoch": 2.287869264613451,
"grad_norm": 0.7974137663841248,
"learning_rate": 8.377924981139413e-05,
"loss": 0.3201,
"step": 10920
},
{
"epoch": 2.2960402262727846,
"grad_norm": 0.8204047083854675,
"learning_rate": 8.3657118342954e-05,
"loss": 0.341,
"step": 10959
},
{
"epoch": 2.3042111879321183,
"grad_norm": 0.7952179312705994,
"learning_rate": 8.353461856711916e-05,
"loss": 0.3362,
"step": 10998
},
{
"epoch": 2.312382149591452,
"grad_norm": 0.6790579557418823,
"learning_rate": 8.341175182439577e-05,
"loss": 0.322,
"step": 11037
},
{
"epoch": 2.320553111250786,
"grad_norm": 0.7183639407157898,
"learning_rate": 8.328851945930563e-05,
"loss": 0.3193,
"step": 11076
},
{
"epoch": 2.3287240729101195,
"grad_norm": 0.7216742038726807,
"learning_rate": 8.316492282037154e-05,
"loss": 0.3319,
"step": 11115
},
{
"epoch": 2.3368950345694532,
"grad_norm": 0.7550281882286072,
"learning_rate": 8.30409632601025e-05,
"loss": 0.3249,
"step": 11154
},
{
"epoch": 2.345065996228787,
"grad_norm": 0.9077494740486145,
"learning_rate": 8.291664213497901e-05,
"loss": 0.3097,
"step": 11193
},
{
"epoch": 2.3532369578881207,
"grad_norm": 0.793636679649353,
"learning_rate": 8.279196080543803e-05,
"loss": 0.3515,
"step": 11232
},
{
"epoch": 2.3614079195474544,
"grad_norm": 0.8357187509536743,
"learning_rate": 8.266692063585828e-05,
"loss": 0.3516,
"step": 11271
},
{
"epoch": 2.369578881206788,
"grad_norm": 0.9153121709823608,
"learning_rate": 8.254152299454522e-05,
"loss": 0.3579,
"step": 11310
},
{
"epoch": 2.377749842866122,
"grad_norm": 0.671792209148407,
"learning_rate": 8.241576925371615e-05,
"loss": 0.3215,
"step": 11349
},
{
"epoch": 2.3859208045254556,
"grad_norm": 0.7391782999038696,
"learning_rate": 8.228966078948503e-05,
"loss": 0.3572,
"step": 11388
},
{
"epoch": 2.3940917661847894,
"grad_norm": 0.5811970829963684,
"learning_rate": 8.216319898184766e-05,
"loss": 0.3054,
"step": 11427
},
{
"epoch": 2.402262727844123,
"grad_norm": 0.5288280248641968,
"learning_rate": 8.203638521466637e-05,
"loss": 0.3348,
"step": 11466
},
{
"epoch": 2.410433689503457,
"grad_norm": 0.8727484941482544,
"learning_rate": 8.190922087565496e-05,
"loss": 0.3514,
"step": 11505
},
{
"epoch": 2.4186046511627906,
"grad_norm": 0.8370525240898132,
"learning_rate": 8.178170735636354e-05,
"loss": 0.3155,
"step": 11544
},
{
"epoch": 2.4267756128221243,
"grad_norm": 0.6996369361877441,
"learning_rate": 8.165384605216329e-05,
"loss": 0.3474,
"step": 11583
},
{
"epoch": 2.434946574481458,
"grad_norm": 0.6904685497283936,
"learning_rate": 8.152563836223111e-05,
"loss": 0.3453,
"step": 11622
},
{
"epoch": 2.443117536140792,
"grad_norm": 0.7102589011192322,
"learning_rate": 8.139708568953444e-05,
"loss": 0.337,
"step": 11661
},
{
"epoch": 2.4512884978001255,
"grad_norm": 0.7969030737876892,
"learning_rate": 8.12681894408158e-05,
"loss": 0.3213,
"step": 11700
},
{
"epoch": 2.4594594594594597,
"grad_norm": 0.961298942565918,
"learning_rate": 8.113895102657744e-05,
"loss": 0.3386,
"step": 11739
},
{
"epoch": 2.4676304211187934,
"grad_norm": 0.8089680075645447,
"learning_rate": 8.100937186106596e-05,
"loss": 0.3091,
"step": 11778
},
{
"epoch": 2.475801382778127,
"grad_norm": 1.0000693798065186,
"learning_rate": 8.087945336225668e-05,
"loss": 0.3335,
"step": 11817
},
{
"epoch": 2.483972344437461,
"grad_norm": 0.6780822277069092,
"learning_rate": 8.074919695183831e-05,
"loss": 0.3306,
"step": 11856
},
{
"epoch": 2.4921433060967946,
"grad_norm": 0.8540327548980713,
"learning_rate": 8.061860405519724e-05,
"loss": 0.3527,
"step": 11895
},
{
"epoch": 2.5003142677561283,
"grad_norm": 0.8583691120147705,
"learning_rate": 8.048767610140204e-05,
"loss": 0.3257,
"step": 11934
},
{
"epoch": 2.508485229415462,
"grad_norm": 0.6268760561943054,
"learning_rate": 8.035641452318775e-05,
"loss": 0.2978,
"step": 11973
},
{
"epoch": 2.5166561910747958,
"grad_norm": 0.9798884391784668,
"learning_rate": 8.022482075694027e-05,
"loss": 0.3297,
"step": 12012
},
{
"epoch": 2.5248271527341295,
"grad_norm": 0.8267270922660828,
"learning_rate": 8.009289624268062e-05,
"loss": 0.3509,
"step": 12051
},
{
"epoch": 2.5329981143934632,
"grad_norm": 0.6128790974617004,
"learning_rate": 7.996064242404912e-05,
"loss": 0.3134,
"step": 12090
},
{
"epoch": 2.541169076052797,
"grad_norm": 0.8551040887832642,
"learning_rate": 7.98280607482897e-05,
"loss": 0.3563,
"step": 12129
},
{
"epoch": 2.5493400377121307,
"grad_norm": 0.9739424586296082,
"learning_rate": 7.969515266623396e-05,
"loss": 0.3468,
"step": 12168
},
{
"epoch": 2.5575109993714644,
"grad_norm": 0.6370837092399597,
"learning_rate": 7.956191963228538e-05,
"loss": 0.3462,
"step": 12207
},
{
"epoch": 2.565681961030798,
"grad_norm": 0.6456401944160461,
"learning_rate": 7.942836310440334e-05,
"loss": 0.3266,
"step": 12246
},
{
"epoch": 2.573852922690132,
"grad_norm": 0.8971288800239563,
"learning_rate": 7.929448454408719e-05,
"loss": 0.3292,
"step": 12285
},
{
"epoch": 2.5820238843494656,
"grad_norm": 0.8512323498725891,
"learning_rate": 7.916028541636027e-05,
"loss": 0.3402,
"step": 12324
},
{
"epoch": 2.5901948460087993,
"grad_norm": 0.7138497233390808,
"learning_rate": 7.902576718975387e-05,
"loss": 0.3658,
"step": 12363
},
{
"epoch": 2.5983658076681335,
"grad_norm": 0.9771799445152283,
"learning_rate": 7.889093133629115e-05,
"loss": 0.312,
"step": 12402
},
{
"epoch": 2.606536769327467,
"grad_norm": 0.8607495427131653,
"learning_rate": 7.875577933147101e-05,
"loss": 0.3289,
"step": 12441
},
{
"epoch": 2.614707730986801,
"grad_norm": 0.9107287526130676,
"learning_rate": 7.8620312654252e-05,
"loss": 0.3435,
"step": 12480
},
{
"epoch": 2.6228786926461343,
"grad_norm": 0.7893658876419067,
"learning_rate": 7.848453278703613e-05,
"loss": 0.3325,
"step": 12519
},
{
"epoch": 2.6310496543054684,
"grad_norm": 0.6861433982849121,
"learning_rate": 7.834844121565257e-05,
"loss": 0.3185,
"step": 12558
},
{
"epoch": 2.639220615964802,
"grad_norm": 0.6425657272338867,
"learning_rate": 7.821203942934148e-05,
"loss": 0.3314,
"step": 12597
},
{
"epoch": 2.647391577624136,
"grad_norm": 0.6099838018417358,
"learning_rate": 7.807532892073768e-05,
"loss": 0.3302,
"step": 12636
},
{
"epoch": 2.6555625392834696,
"grad_norm": 0.8393537402153015,
"learning_rate": 7.793831118585429e-05,
"loss": 0.3088,
"step": 12675
},
{
"epoch": 2.6637335009428034,
"grad_norm": 0.752255916595459,
"learning_rate": 7.780098772406643e-05,
"loss": 0.3205,
"step": 12714
},
{
"epoch": 2.671904462602137,
"grad_norm": 0.7880775928497314,
"learning_rate": 7.766336003809472e-05,
"loss": 0.3306,
"step": 12753
},
{
"epoch": 2.680075424261471,
"grad_norm": 0.708419919013977,
"learning_rate": 7.752542963398892e-05,
"loss": 0.3554,
"step": 12792
},
{
"epoch": 2.6882463859208046,
"grad_norm": 0.8341570496559143,
"learning_rate": 7.738719802111139e-05,
"loss": 0.3145,
"step": 12831
},
{
"epoch": 2.6964173475801383,
"grad_norm": 0.703484058380127,
"learning_rate": 7.724866671212059e-05,
"loss": 0.3284,
"step": 12870
},
{
"epoch": 2.704588309239472,
"grad_norm": 0.7732682228088379,
"learning_rate": 7.710983722295455e-05,
"loss": 0.3376,
"step": 12909
},
{
"epoch": 2.7127592708988058,
"grad_norm": 0.7687979340553284,
"learning_rate": 7.697071107281428e-05,
"loss": 0.3418,
"step": 12948
},
{
"epoch": 2.7209302325581395,
"grad_norm": 0.6588707566261292,
"learning_rate": 7.683128978414707e-05,
"loss": 0.3352,
"step": 12987
},
{
"epoch": 2.729101194217473,
"grad_norm": 1.0603809356689453,
"learning_rate": 7.669157488262997e-05,
"loss": 0.3409,
"step": 13026
},
{
"epoch": 2.737272155876807,
"grad_norm": 0.603203296661377,
"learning_rate": 7.655156789715295e-05,
"loss": 0.3578,
"step": 13065
},
{
"epoch": 2.7454431175361407,
"grad_norm": 0.9588086605072021,
"learning_rate": 7.641127035980222e-05,
"loss": 0.3463,
"step": 13104
},
{
"epoch": 2.7536140791954744,
"grad_norm": 0.7366406917572021,
"learning_rate": 7.627068380584359e-05,
"loss": 0.3262,
"step": 13143
},
{
"epoch": 2.761785040854808,
"grad_norm": 0.7737981677055359,
"learning_rate": 7.612980977370542e-05,
"loss": 0.3463,
"step": 13182
},
{
"epoch": 2.7699560025141423,
"grad_norm": 0.7862906455993652,
"learning_rate": 7.5988649804962e-05,
"loss": 0.3257,
"step": 13221
},
{
"epoch": 2.7781269641734756,
"grad_norm": 0.8357959389686584,
"learning_rate": 7.584720544431661e-05,
"loss": 0.3365,
"step": 13260
},
{
"epoch": 2.7862979258328098,
"grad_norm": 0.8603371381759644,
"learning_rate": 7.570547823958454e-05,
"loss": 0.3296,
"step": 13299
},
{
"epoch": 2.794468887492143,
"grad_norm": 1.0153621435165405,
"learning_rate": 7.55634697416763e-05,
"loss": 0.3621,
"step": 13338
},
{
"epoch": 2.8026398491514772,
"grad_norm": 0.6060919761657715,
"learning_rate": 7.542118150458054e-05,
"loss": 0.3263,
"step": 13377
},
{
"epoch": 2.810810810810811,
"grad_norm": 0.6489693522453308,
"learning_rate": 7.527861508534706e-05,
"loss": 0.3632,
"step": 13416
},
{
"epoch": 2.8189817724701447,
"grad_norm": 0.7105704545974731,
"learning_rate": 7.513577204406985e-05,
"loss": 0.3522,
"step": 13455
},
{
"epoch": 2.8271527341294784,
"grad_norm": 0.7109383940696716,
"learning_rate": 7.499265394386983e-05,
"loss": 0.3462,
"step": 13494
},
{
"epoch": 2.835323695788812,
"grad_norm": 0.7103734016418457,
"learning_rate": 7.484926235087799e-05,
"loss": 0.3559,
"step": 13533
},
{
"epoch": 2.843494657448146,
"grad_norm": 0.6860262155532837,
"learning_rate": 7.470559883421809e-05,
"loss": 0.3576,
"step": 13572
},
{
"epoch": 2.8516656191074796,
"grad_norm": 0.7737835645675659,
"learning_rate": 7.456166496598953e-05,
"loss": 0.3712,
"step": 13611
},
{
"epoch": 2.8598365807668134,
"grad_norm": 0.8572138547897339,
"learning_rate": 7.441746232125013e-05,
"loss": 0.3201,
"step": 13650
},
{
"epoch": 2.868007542426147,
"grad_norm": 0.9748623967170715,
"learning_rate": 7.427299247799895e-05,
"loss": 0.3563,
"step": 13689
},
{
"epoch": 2.876178504085481,
"grad_norm": 0.8315281867980957,
"learning_rate": 7.412825701715893e-05,
"loss": 0.3346,
"step": 13728
},
{
"epoch": 2.8843494657448145,
"grad_norm": 0.890327513217926,
"learning_rate": 7.398325752255973e-05,
"loss": 0.3337,
"step": 13767
},
{
"epoch": 2.8925204274041483,
"grad_norm": 0.6312198638916016,
"learning_rate": 7.38379955809202e-05,
"loss": 0.337,
"step": 13806
},
{
"epoch": 2.900691389063482,
"grad_norm": 0.6604064702987671,
"learning_rate": 7.369247278183123e-05,
"loss": 0.3397,
"step": 13845
},
{
"epoch": 2.9088623507228157,
"grad_norm": 0.5490432381629944,
"learning_rate": 7.35466907177382e-05,
"loss": 0.326,
"step": 13884
},
{
"epoch": 2.9170333123821495,
"grad_norm": 0.9371681809425354,
"learning_rate": 7.340065098392361e-05,
"loss": 0.3435,
"step": 13923
},
{
"epoch": 2.925204274041483,
"grad_norm": 0.8379968404769897,
"learning_rate": 7.325435517848963e-05,
"loss": 0.3321,
"step": 13962
},
{
"epoch": 2.933375235700817,
"grad_norm": 0.3769962191581726,
"learning_rate": 7.310780490234061e-05,
"loss": 0.3707,
"step": 14001
},
{
"epoch": 2.941546197360151,
"grad_norm": 0.659055233001709,
"learning_rate": 7.296100175916556e-05,
"loss": 0.331,
"step": 14040
},
{
"epoch": 2.9497171590194844,
"grad_norm": 0.840793251991272,
"learning_rate": 7.281394735542056e-05,
"loss": 0.3382,
"step": 14079
},
{
"epoch": 2.9578881206788186,
"grad_norm": 0.8272313475608826,
"learning_rate": 7.266664330031128e-05,
"loss": 0.316,
"step": 14118
},
{
"epoch": 2.966059082338152,
"grad_norm": 0.9367392063140869,
"learning_rate": 7.25190912057752e-05,
"loss": 0.336,
"step": 14157
},
{
"epoch": 2.974230043997486,
"grad_norm": 0.6122297048568726,
"learning_rate": 7.237129268646419e-05,
"loss": 0.323,
"step": 14196
},
{
"epoch": 2.9824010056568198,
"grad_norm": 0.7298004031181335,
"learning_rate": 7.22232493597267e-05,
"loss": 0.3361,
"step": 14235
},
{
"epoch": 2.9905719673161535,
"grad_norm": 0.7116037607192993,
"learning_rate": 7.207496284559003e-05,
"loss": 0.3035,
"step": 14274
},
{
"epoch": 2.998742928975487,
"grad_norm": 0.886817216873169,
"learning_rate": 7.192643476674272e-05,
"loss": 0.3156,
"step": 14313
},
{
"epoch": 3.006913890634821,
"grad_norm": 0.7054494619369507,
"learning_rate": 7.177766674851674e-05,
"loss": 0.2916,
"step": 14352
},
{
"epoch": 3.0150848522941547,
"grad_norm": 0.6415011882781982,
"learning_rate": 7.162866041886963e-05,
"loss": 0.2485,
"step": 14391
},
{
"epoch": 3.0232558139534884,
"grad_norm": 0.9112984538078308,
"learning_rate": 7.147941740836686e-05,
"loss": 0.2598,
"step": 14430
},
{
"epoch": 3.031426775612822,
"grad_norm": 0.9063923358917236,
"learning_rate": 7.132993935016377e-05,
"loss": 0.255,
"step": 14469
},
{
"epoch": 3.039597737272156,
"grad_norm": 0.897348165512085,
"learning_rate": 7.118022787998788e-05,
"loss": 0.2622,
"step": 14508
},
{
"epoch": 3.0477686989314896,
"grad_norm": 0.9947516322135925,
"learning_rate": 7.103028463612094e-05,
"loss": 0.2539,
"step": 14547
},
{
"epoch": 3.0559396605908233,
"grad_norm": 0.9619819521903992,
"learning_rate": 7.088011125938091e-05,
"loss": 0.2546,
"step": 14586
},
{
"epoch": 3.064110622250157,
"grad_norm": 0.9941222071647644,
"learning_rate": 7.072970939310412e-05,
"loss": 0.2605,
"step": 14625
},
{
"epoch": 3.072281583909491,
"grad_norm": 0.8076801896095276,
"learning_rate": 7.057908068312726e-05,
"loss": 0.2748,
"step": 14664
},
{
"epoch": 3.0804525455688245,
"grad_norm": 0.9070518016815186,
"learning_rate": 7.042822677776929e-05,
"loss": 0.26,
"step": 14703
},
{
"epoch": 3.0886235072281583,
"grad_norm": 1.0293809175491333,
"learning_rate": 7.027714932781355e-05,
"loss": 0.2703,
"step": 14742
},
{
"epoch": 3.096794468887492,
"grad_norm": 0.982684850692749,
"learning_rate": 7.012584998648956e-05,
"loss": 0.2693,
"step": 14781
},
{
"epoch": 3.1049654305468257,
"grad_norm": 1.306250810623169,
"learning_rate": 6.997433040945498e-05,
"loss": 0.2772,
"step": 14820
},
{
"epoch": 3.1131363922061595,
"grad_norm": 0.8497562408447266,
"learning_rate": 6.982259225477753e-05,
"loss": 0.278,
"step": 14859
},
{
"epoch": 3.121307353865493,
"grad_norm": 0.9325224161148071,
"learning_rate": 6.967063718291673e-05,
"loss": 0.2726,
"step": 14898
},
{
"epoch": 3.1294783155248274,
"grad_norm": 0.9601906538009644,
"learning_rate": 6.951846685670594e-05,
"loss": 0.2573,
"step": 14937
},
{
"epoch": 3.137649277184161,
"grad_norm": 0.8388446569442749,
"learning_rate": 6.936608294133391e-05,
"loss": 0.2719,
"step": 14976
},
{
"epoch": 3.145820238843495,
"grad_norm": 0.7028934359550476,
"learning_rate": 6.921348710432675e-05,
"loss": 0.2539,
"step": 15015
},
{
"epoch": 3.1539912005028286,
"grad_norm": 0.8849790692329407,
"learning_rate": 6.906068101552957e-05,
"loss": 0.2436,
"step": 15054
},
{
"epoch": 3.1621621621621623,
"grad_norm": 0.7171310186386108,
"learning_rate": 6.890766634708826e-05,
"loss": 0.26,
"step": 15093
},
{
"epoch": 3.170333123821496,
"grad_norm": 0.8107304573059082,
"learning_rate": 6.875444477343123e-05,
"loss": 0.2505,
"step": 15132
},
{
"epoch": 3.1785040854808297,
"grad_norm": 1.0732225179672241,
"learning_rate": 6.860101797125098e-05,
"loss": 0.2418,
"step": 15171
},
{
"epoch": 3.1866750471401635,
"grad_norm": 1.0856764316558838,
"learning_rate": 6.844738761948584e-05,
"loss": 0.2585,
"step": 15210
},
{
"epoch": 3.194846008799497,
"grad_norm": 1.0335191488265991,
"learning_rate": 6.829355539930156e-05,
"loss": 0.2838,
"step": 15249
},
{
"epoch": 3.203016970458831,
"grad_norm": 0.895849883556366,
"learning_rate": 6.81395229940729e-05,
"loss": 0.2619,
"step": 15288
},
{
"epoch": 3.2111879321181647,
"grad_norm": 1.054457426071167,
"learning_rate": 6.798529208936528e-05,
"loss": 0.2867,
"step": 15327
},
{
"epoch": 3.2193588937774984,
"grad_norm": 1.171610951423645,
"learning_rate": 6.783086437291623e-05,
"loss": 0.2718,
"step": 15366
},
{
"epoch": 3.227529855436832,
"grad_norm": 1.4226104021072388,
"learning_rate": 6.767624153461701e-05,
"loss": 0.2584,
"step": 15405
},
{
"epoch": 3.235700817096166,
"grad_norm": 0.7325811386108398,
"learning_rate": 6.75214252664941e-05,
"loss": 0.2572,
"step": 15444
},
{
"epoch": 3.2438717787554996,
"grad_norm": 1.8367798328399658,
"learning_rate": 6.736641726269065e-05,
"loss": 0.2383,
"step": 15483
},
{
"epoch": 3.2520427404148333,
"grad_norm": 1.2098209857940674,
"learning_rate": 6.721121921944791e-05,
"loss": 0.2668,
"step": 15522
},
{
"epoch": 3.260213702074167,
"grad_norm": 0.9780440330505371,
"learning_rate": 6.70558328350868e-05,
"loss": 0.2375,
"step": 15561
},
{
"epoch": 3.268384663733501,
"grad_norm": 0.8904445767402649,
"learning_rate": 6.69002598099892e-05,
"loss": 0.2525,
"step": 15600
},
{
"epoch": 3.2765556253928345,
"grad_norm": 0.9340187311172485,
"learning_rate": 6.674450184657939e-05,
"loss": 0.2691,
"step": 15639
},
{
"epoch": 3.2847265870521687,
"grad_norm": 0.9010167717933655,
"learning_rate": 6.658856064930542e-05,
"loss": 0.258,
"step": 15678
},
{
"epoch": 3.292897548711502,
"grad_norm": 1.3005743026733398,
"learning_rate": 6.643243792462047e-05,
"loss": 0.2713,
"step": 15717
},
{
"epoch": 3.301068510370836,
"grad_norm": 0.9360527396202087,
"learning_rate": 6.627613538096412e-05,
"loss": 0.2615,
"step": 15756
},
{
"epoch": 3.30923947203017,
"grad_norm": 0.8945103287696838,
"learning_rate": 6.611965472874371e-05,
"loss": 0.2698,
"step": 15795
},
{
"epoch": 3.3174104336895036,
"grad_norm": 0.8402488827705383,
"learning_rate": 6.596299768031567e-05,
"loss": 0.2723,
"step": 15834
},
{
"epoch": 3.3255813953488373,
"grad_norm": 0.8912140130996704,
"learning_rate": 6.580616594996663e-05,
"loss": 0.2754,
"step": 15873
},
{
"epoch": 3.333752357008171,
"grad_norm": 0.9579502940177917,
"learning_rate": 6.564916125389482e-05,
"loss": 0.2474,
"step": 15912
},
{
"epoch": 3.341923318667505,
"grad_norm": 1.2083587646484375,
"learning_rate": 6.549198531019116e-05,
"loss": 0.2546,
"step": 15951
},
{
"epoch": 3.3500942803268385,
"grad_norm": 1.0274497270584106,
"learning_rate": 6.533463983882059e-05,
"loss": 0.2659,
"step": 15990
},
{
"epoch": 3.3582652419861723,
"grad_norm": 0.9805575609207153,
"learning_rate": 6.517712656160313e-05,
"loss": 0.2639,
"step": 16029
},
{
"epoch": 3.366436203645506,
"grad_norm": 1.1635912656784058,
"learning_rate": 6.501944720219508e-05,
"loss": 0.2805,
"step": 16068
},
{
"epoch": 3.3746071653048397,
"grad_norm": 1.2201091051101685,
"learning_rate": 6.486160348607023e-05,
"loss": 0.2581,
"step": 16107
},
{
"epoch": 3.3827781269641735,
"grad_norm": 0.8909711837768555,
"learning_rate": 6.470359714050083e-05,
"loss": 0.2699,
"step": 16146
},
{
"epoch": 3.390949088623507,
"grad_norm": 1.3544241189956665,
"learning_rate": 6.454542989453882e-05,
"loss": 0.2694,
"step": 16185
},
{
"epoch": 3.399120050282841,
"grad_norm": 0.7139955163002014,
"learning_rate": 6.438710347899687e-05,
"loss": 0.2752,
"step": 16224
},
{
"epoch": 3.4072910119421747,
"grad_norm": 0.9686264991760254,
"learning_rate": 6.422861962642938e-05,
"loss": 0.2614,
"step": 16263
},
{
"epoch": 3.4154619736015084,
"grad_norm": 0.9705599546432495,
"learning_rate": 6.406998007111365e-05,
"loss": 0.2515,
"step": 16302
},
{
"epoch": 3.423632935260842,
"grad_norm": 0.5927676558494568,
"learning_rate": 6.391118654903074e-05,
"loss": 0.2638,
"step": 16341
},
{
"epoch": 3.431803896920176,
"grad_norm": 1.0339412689208984,
"learning_rate": 6.375224079784662e-05,
"loss": 0.281,
"step": 16380
},
{
"epoch": 3.4399748585795096,
"grad_norm": 1.2015362977981567,
"learning_rate": 6.359314455689308e-05,
"loss": 0.2517,
"step": 16419
},
{
"epoch": 3.4481458202388433,
"grad_norm": 0.8283564448356628,
"learning_rate": 6.343389956714866e-05,
"loss": 0.2675,
"step": 16458
},
{
"epoch": 3.4563167818981775,
"grad_norm": 0.9697973132133484,
"learning_rate": 6.32745075712197e-05,
"loss": 0.2725,
"step": 16497
},
{
"epoch": 3.4644877435575108,
"grad_norm": 0.8286094665527344,
"learning_rate": 6.311497031332122e-05,
"loss": 0.2689,
"step": 16536
},
{
"epoch": 3.472658705216845,
"grad_norm": 0.9562814235687256,
"learning_rate": 6.29552895392578e-05,
"loss": 0.2619,
"step": 16575
},
{
"epoch": 3.4808296668761787,
"grad_norm": 0.840769350528717,
"learning_rate": 6.279546699640452e-05,
"loss": 0.2574,
"step": 16614
},
{
"epoch": 3.4890006285355124,
"grad_norm": 0.820468008518219,
"learning_rate": 6.263550443368783e-05,
"loss": 0.2482,
"step": 16653
},
{
"epoch": 3.497171590194846,
"grad_norm": 0.8476835489273071,
"learning_rate": 6.247540360156638e-05,
"loss": 0.2823,
"step": 16692
},
{
"epoch": 3.50534255185418,
"grad_norm": 1.0869783163070679,
"learning_rate": 6.231516625201196e-05,
"loss": 0.266,
"step": 16731
},
{
"epoch": 3.5135135135135136,
"grad_norm": 1.0051486492156982,
"learning_rate": 6.215479413849019e-05,
"loss": 0.2489,
"step": 16770
},
{
"epoch": 3.5216844751728473,
"grad_norm": 1.0071632862091064,
"learning_rate": 6.199428901594142e-05,
"loss": 0.2776,
"step": 16809
},
{
"epoch": 3.529855436832181,
"grad_norm": 1.0343513488769531,
"learning_rate": 6.183365264076152e-05,
"loss": 0.2811,
"step": 16848
},
{
"epoch": 3.538026398491515,
"grad_norm": 1.3418666124343872,
"learning_rate": 6.167288677078266e-05,
"loss": 0.2581,
"step": 16887
},
{
"epoch": 3.5461973601508485,
"grad_norm": 1.228233814239502,
"learning_rate": 6.151199316525403e-05,
"loss": 0.3023,
"step": 16926
},
{
"epoch": 3.5543683218101823,
"grad_norm": 1.132449746131897,
"learning_rate": 6.135097358482265e-05,
"loss": 0.2763,
"step": 16965
},
{
"epoch": 3.562539283469516,
"grad_norm": 0.8154187202453613,
"learning_rate": 6.118982979151405e-05,
"loss": 0.2607,
"step": 17004
},
{
"epoch": 3.5707102451288497,
"grad_norm": 0.7642055749893188,
"learning_rate": 6.102856354871304e-05,
"loss": 0.2704,
"step": 17043
},
{
"epoch": 3.5788812067881834,
"grad_norm": 0.7441285252571106,
"learning_rate": 6.086717662114434e-05,
"loss": 0.2704,
"step": 17082
},
{
"epoch": 3.587052168447517,
"grad_norm": 1.1328179836273193,
"learning_rate": 6.0705670774853375e-05,
"loss": 0.2734,
"step": 17121
},
{
"epoch": 3.595223130106851,
"grad_norm": 1.2895148992538452,
"learning_rate": 6.054404777718683e-05,
"loss": 0.2534,
"step": 17160
},
{
"epoch": 3.6033940917661846,
"grad_norm": 1.1969038248062134,
"learning_rate": 6.0382309396773405e-05,
"loss": 0.288,
"step": 17199
},
{
"epoch": 3.611565053425519,
"grad_norm": 1.0326099395751953,
"learning_rate": 6.022045740350444e-05,
"loss": 0.273,
"step": 17238
},
{
"epoch": 3.619736015084852,
"grad_norm": 0.842157244682312,
"learning_rate": 6.005849356851448e-05,
"loss": 0.2757,
"step": 17277
},
{
"epoch": 3.6279069767441863,
"grad_norm": 1.004172444343567,
"learning_rate": 5.989641966416201e-05,
"loss": 0.2719,
"step": 17316
},
{
"epoch": 3.6360779384035196,
"grad_norm": 1.0707955360412598,
"learning_rate": 5.973423746400991e-05,
"loss": 0.2785,
"step": 17355
},
{
"epoch": 3.6442489000628537,
"grad_norm": 0.9679854512214661,
"learning_rate": 5.957194874280623e-05,
"loss": 0.2628,
"step": 17394
},
{
"epoch": 3.6524198617221875,
"grad_norm": 1.2920995950698853,
"learning_rate": 5.940955527646461e-05,
"loss": 0.2803,
"step": 17433
},
{
"epoch": 3.660590823381521,
"grad_norm": 1.1548652648925781,
"learning_rate": 5.924705884204491e-05,
"loss": 0.2827,
"step": 17472
},
{
"epoch": 3.668761785040855,
"grad_norm": 1.4897462129592896,
"learning_rate": 5.908446121773381e-05,
"loss": 0.2883,
"step": 17511
},
{
"epoch": 3.6769327467001887,
"grad_norm": 1.005393147468567,
"learning_rate": 5.892176418282522e-05,
"loss": 0.2635,
"step": 17550
},
{
"epoch": 3.6851037083595224,
"grad_norm": 1.1993404626846313,
"learning_rate": 5.8758969517701e-05,
"loss": 0.2786,
"step": 17589
},
{
"epoch": 3.693274670018856,
"grad_norm": 1.0545886754989624,
"learning_rate": 5.859607900381129e-05,
"loss": 0.2567,
"step": 17628
},
{
"epoch": 3.70144563167819,
"grad_norm": 0.632892906665802,
"learning_rate": 5.84330944236551e-05,
"loss": 0.2755,
"step": 17667
},
{
"epoch": 3.7096165933375236,
"grad_norm": 1.0235896110534668,
"learning_rate": 5.8270017560760845e-05,
"loss": 0.2709,
"step": 17706
},
{
"epoch": 3.7177875549968573,
"grad_norm": 1.0493903160095215,
"learning_rate": 5.8106850199666754e-05,
"loss": 0.2707,
"step": 17745
},
{
"epoch": 3.725958516656191,
"grad_norm": 1.1471383571624756,
"learning_rate": 5.794359412590136e-05,
"loss": 0.257,
"step": 17784
},
{
"epoch": 3.7341294783155248,
"grad_norm": 1.0675766468048096,
"learning_rate": 5.778025112596401e-05,
"loss": 0.2665,
"step": 17823
},
{
"epoch": 3.7423004399748585,
"grad_norm": 1.0171725749969482,
"learning_rate": 5.761682298730524e-05,
"loss": 0.27,
"step": 17862
},
{
"epoch": 3.7504714016341922,
"grad_norm": 1.209671139717102,
"learning_rate": 5.745331149830729e-05,
"loss": 0.2723,
"step": 17901
},
{
"epoch": 3.758642363293526,
"grad_norm": 1.1893844604492188,
"learning_rate": 5.728971844826445e-05,
"loss": 0.2835,
"step": 17940
},
{
"epoch": 3.7668133249528597,
"grad_norm": 1.2675912380218506,
"learning_rate": 5.7126045627363556e-05,
"loss": 0.2979,
"step": 17979
},
{
"epoch": 3.7749842866121934,
"grad_norm": 1.090598464012146,
"learning_rate": 5.6962294826664385e-05,
"loss": 0.2748,
"step": 18018
},
{
"epoch": 3.7831552482715276,
"grad_norm": 1.0700677633285522,
"learning_rate": 5.679846783808e-05,
"loss": 0.3043,
"step": 18057
},
{
"epoch": 3.791326209930861,
"grad_norm": 1.0571792125701904,
"learning_rate": 5.6634566454357196e-05,
"loss": 0.2589,
"step": 18096
},
{
"epoch": 3.799497171590195,
"grad_norm": 0.8858397006988525,
"learning_rate": 5.6470592469056915e-05,
"loss": 0.2976,
"step": 18135
},
{
"epoch": 3.8076681332495284,
"grad_norm": 1.1942354440689087,
"learning_rate": 5.6306547676534514e-05,
"loss": 0.2742,
"step": 18174
},
{
"epoch": 3.8158390949088625,
"grad_norm": 1.4405794143676758,
"learning_rate": 5.614243387192022e-05,
"loss": 0.3013,
"step": 18213
},
{
"epoch": 3.8240100565681963,
"grad_norm": 0.9287608861923218,
"learning_rate": 5.5978252851099425e-05,
"loss": 0.2542,
"step": 18252
},
{
"epoch": 3.83218101822753,
"grad_norm": 1.217387080192566,
"learning_rate": 5.581400641069309e-05,
"loss": 0.2768,
"step": 18291
},
{
"epoch": 3.8403519798868637,
"grad_norm": 1.1811288595199585,
"learning_rate": 5.564969634803806e-05,
"loss": 0.263,
"step": 18330
},
{
"epoch": 3.8485229415461975,
"grad_norm": 0.9432389140129089,
"learning_rate": 5.548532446116737e-05,
"loss": 0.2612,
"step": 18369
},
{
"epoch": 3.856693903205531,
"grad_norm": 1.010432243347168,
"learning_rate": 5.532089254879061e-05,
"loss": 0.2505,
"step": 18408
},
{
"epoch": 3.864864864864865,
"grad_norm": 1.1816331148147583,
"learning_rate": 5.515640241027423e-05,
"loss": 0.282,
"step": 18447
},
{
"epoch": 3.8730358265241986,
"grad_norm": 1.011232614517212,
"learning_rate": 5.499185584562183e-05,
"loss": 0.2894,
"step": 18486
},
{
"epoch": 3.8812067881835324,
"grad_norm": 0.7370874881744385,
"learning_rate": 5.482725465545449e-05,
"loss": 0.2692,
"step": 18525
},
{
"epoch": 3.889377749842866,
"grad_norm": 0.831798791885376,
"learning_rate": 5.4662600640991025e-05,
"loss": 0.2984,
"step": 18564
},
{
"epoch": 3.8975487115022,
"grad_norm": 0.7684634923934937,
"learning_rate": 5.4497895604028334e-05,
"loss": 0.282,
"step": 18603
},
{
"epoch": 3.9057196731615336,
"grad_norm": 1.2753115892410278,
"learning_rate": 5.4333141346921644e-05,
"loss": 0.2692,
"step": 18642
},
{
"epoch": 3.9138906348208673,
"grad_norm": 0.9664742350578308,
"learning_rate": 5.4168339672564795e-05,
"loss": 0.2683,
"step": 18681
},
{
"epoch": 3.922061596480201,
"grad_norm": 1.0317939519882202,
"learning_rate": 5.4003492384370504e-05,
"loss": 0.2635,
"step": 18720
},
{
"epoch": 3.9302325581395348,
"grad_norm": 0.88239985704422,
"learning_rate": 5.383860128625062e-05,
"loss": 0.2631,
"step": 18759
},
{
"epoch": 3.9384035197988685,
"grad_norm": 1.137979507446289,
"learning_rate": 5.367366818259646e-05,
"loss": 0.2734,
"step": 18798
},
{
"epoch": 3.946574481458202,
"grad_norm": 1.0174254179000854,
"learning_rate": 5.3508694878258934e-05,
"loss": 0.278,
"step": 18837
},
{
"epoch": 3.9547454431175364,
"grad_norm": 0.9561147093772888,
"learning_rate": 5.334368317852889e-05,
"loss": 0.2552,
"step": 18876
},
{
"epoch": 3.9629164047768697,
"grad_norm": 1.178209662437439,
"learning_rate": 5.317863488911737e-05,
"loss": 0.2615,
"step": 18915
},
{
"epoch": 3.971087366436204,
"grad_norm": 1.1684329509735107,
"learning_rate": 5.3013551816135756e-05,
"loss": 0.283,
"step": 18954
},
{
"epoch": 3.979258328095537,
"grad_norm": 0.7574102878570557,
"learning_rate": 5.2848435766076096e-05,
"loss": 0.3018,
"step": 18993
},
{
"epoch": 3.9874292897548713,
"grad_norm": 1.389844298362732,
"learning_rate": 5.268328854579132e-05,
"loss": 0.2737,
"step": 19032
},
{
"epoch": 3.995600251414205,
"grad_norm": 0.8808802366256714,
"learning_rate": 5.251811196247541e-05,
"loss": 0.2656,
"step": 19071
},
{
"epoch": 4.003771213073539,
"grad_norm": 1.1782208681106567,
"learning_rate": 5.2352907823643715e-05,
"loss": 0.2257,
"step": 19110
},
{
"epoch": 4.011942174732872,
"grad_norm": 0.7572002410888672,
"learning_rate": 5.218767793711306e-05,
"loss": 0.1908,
"step": 19149
},
{
"epoch": 4.020113136392206,
"grad_norm": 1.2401790618896484,
"learning_rate": 5.202242411098206e-05,
"loss": 0.173,
"step": 19188
},
{
"epoch": 4.0282840980515395,
"grad_norm": 1.61008882522583,
"learning_rate": 5.1857148153611336e-05,
"loss": 0.1997,
"step": 19227
},
{
"epoch": 4.036455059710874,
"grad_norm": 1.042442798614502,
"learning_rate": 5.169185187360362e-05,
"loss": 0.1846,
"step": 19266
},
{
"epoch": 4.044626021370207,
"grad_norm": 1.8919810056686401,
"learning_rate": 5.1526537079784076e-05,
"loss": 0.1755,
"step": 19305
},
{
"epoch": 4.052796983029541,
"grad_norm": 1.427526593208313,
"learning_rate": 5.136120558118044e-05,
"loss": 0.1782,
"step": 19344
},
{
"epoch": 4.060967944688875,
"grad_norm": 0.9603130221366882,
"learning_rate": 5.119585918700327e-05,
"loss": 0.1934,
"step": 19383
},
{
"epoch": 4.069138906348209,
"grad_norm": 1.2158547639846802,
"learning_rate": 5.1030499706626126e-05,
"loss": 0.1834,
"step": 19422
},
{
"epoch": 4.077309868007543,
"grad_norm": 1.2463704347610474,
"learning_rate": 5.0865128949565735e-05,
"loss": 0.1956,
"step": 19461
},
{
"epoch": 4.085480829666876,
"grad_norm": 0.9006099700927734,
"learning_rate": 5.069974872546227e-05,
"loss": 0.1949,
"step": 19500
},
{
"epoch": 4.09365179132621,
"grad_norm": 1.2113624811172485,
"learning_rate": 5.053436084405946e-05,
"loss": 0.1901,
"step": 19539
},
{
"epoch": 4.1018227529855436,
"grad_norm": 1.0677685737609863,
"learning_rate": 5.036896711518485e-05,
"loss": 0.1904,
"step": 19578
},
{
"epoch": 4.109993714644878,
"grad_norm": 1.0412025451660156,
"learning_rate": 5.020356934872997e-05,
"loss": 0.1894,
"step": 19617
},
{
"epoch": 4.118164676304211,
"grad_norm": 1.1890144348144531,
"learning_rate": 5.0038169354630537e-05,
"loss": 0.1765,
"step": 19656
},
{
"epoch": 4.126335637963545,
"grad_norm": 1.4251190423965454,
"learning_rate": 4.9872768942846645e-05,
"loss": 0.1896,
"step": 19695
},
{
"epoch": 4.1345065996228785,
"grad_norm": 1.3054598569869995,
"learning_rate": 4.970736992334294e-05,
"loss": 0.1956,
"step": 19734
},
{
"epoch": 4.142677561282213,
"grad_norm": 1.2113808393478394,
"learning_rate": 4.9541974106068844e-05,
"loss": 0.1876,
"step": 19773
},
{
"epoch": 4.150848522941546,
"grad_norm": 1.1403577327728271,
"learning_rate": 4.9376583300938756e-05,
"loss": 0.1958,
"step": 19812
},
{
"epoch": 4.15901948460088,
"grad_norm": 1.28948974609375,
"learning_rate": 4.921119931781218e-05,
"loss": 0.1856,
"step": 19851
},
{
"epoch": 4.167190446260213,
"grad_norm": 1.168059229850769,
"learning_rate": 4.9045823966474046e-05,
"loss": 0.1867,
"step": 19890
},
{
"epoch": 4.175361407919548,
"grad_norm": 0.7937365770339966,
"learning_rate": 4.888045905661472e-05,
"loss": 0.1846,
"step": 19929
},
{
"epoch": 4.183532369578881,
"grad_norm": 0.8156687021255493,
"learning_rate": 4.871510639781043e-05,
"loss": 0.1879,
"step": 19968
},
{
"epoch": 4.190236748376283,
"eval_accuracy": 0.01002925168722868,
"eval_loss": 0.5226494073867798,
"eval_runtime": 848.4593,
"eval_samples_per_second": 5.64,
"eval_steps_per_second": 1.411,
"step": 20000
},
{
"epoch": 4.191703331238215,
"grad_norm": 1.2544173002243042,
"learning_rate": 4.854976779950323e-05,
"loss": 0.1893,
"step": 20007
},
{
"epoch": 4.199874292897548,
"grad_norm": 1.2081633806228638,
"learning_rate": 4.838444507098138e-05,
"loss": 0.1839,
"step": 20046
},
{
"epoch": 4.2080452545568825,
"grad_norm": 1.4302409887313843,
"learning_rate": 4.821914002135945e-05,
"loss": 0.1896,
"step": 20085
},
{
"epoch": 4.216216216216216,
"grad_norm": 0.8422350287437439,
"learning_rate": 4.8053854459558564e-05,
"loss": 0.1874,
"step": 20124
},
{
"epoch": 4.22438717787555,
"grad_norm": 1.034628987312317,
"learning_rate": 4.788859019428659e-05,
"loss": 0.187,
"step": 20163
},
{
"epoch": 4.232558139534884,
"grad_norm": 2.0154268741607666,
"learning_rate": 4.772334903401833e-05,
"loss": 0.1903,
"step": 20202
},
{
"epoch": 4.240729101194217,
"grad_norm": 1.2323023080825806,
"learning_rate": 4.755813278697579e-05,
"loss": 0.1722,
"step": 20241
},
{
"epoch": 4.248900062853552,
"grad_norm": 1.2616121768951416,
"learning_rate": 4.7392943261108297e-05,
"loss": 0.1864,
"step": 20280
},
{
"epoch": 4.257071024512885,
"grad_norm": 0.8832495212554932,
"learning_rate": 4.722778226407283e-05,
"loss": 0.1785,
"step": 20319
},
{
"epoch": 4.265241986172219,
"grad_norm": 1.1314343214035034,
"learning_rate": 4.706265160321412e-05,
"loss": 0.19,
"step": 20358
},
{
"epoch": 4.273412947831552,
"grad_norm": 1.3426203727722168,
"learning_rate": 4.689755308554501e-05,
"loss": 0.1907,
"step": 20397
},
{
"epoch": 4.2815839094908865,
"grad_norm": 1.469979166984558,
"learning_rate": 4.6732488517726494e-05,
"loss": 0.2,
"step": 20436
},
{
"epoch": 4.28975487115022,
"grad_norm": 0.8553899526596069,
"learning_rate": 4.6567459706048174e-05,
"loss": 0.1745,
"step": 20475
},
{
"epoch": 4.297925832809554,
"grad_norm": 1.4792917966842651,
"learning_rate": 4.640246845640827e-05,
"loss": 0.1895,
"step": 20514
},
{
"epoch": 4.306096794468887,
"grad_norm": 1.461832880973816,
"learning_rate": 4.623751657429404e-05,
"loss": 0.1936,
"step": 20553
},
{
"epoch": 4.314267756128221,
"grad_norm": 1.4538739919662476,
"learning_rate": 4.6072605864761906e-05,
"loss": 0.1883,
"step": 20592
},
{
"epoch": 4.322438717787555,
"grad_norm": 1.296778678894043,
"learning_rate": 4.590773813241775e-05,
"loss": 0.1964,
"step": 20631
},
{
"epoch": 4.330609679446889,
"grad_norm": 1.2757669687271118,
"learning_rate": 4.5742915181397154e-05,
"loss": 0.1893,
"step": 20670
},
{
"epoch": 4.338780641106222,
"grad_norm": 1.5511046648025513,
"learning_rate": 4.557813881534568e-05,
"loss": 0.186,
"step": 20709
},
{
"epoch": 4.346951602765556,
"grad_norm": 1.354141116142273,
"learning_rate": 4.541341083739908e-05,
"loss": 0.1947,
"step": 20748
},
{
"epoch": 4.35512256442489,
"grad_norm": 0.9544211030006409,
"learning_rate": 4.524873305016366e-05,
"loss": 0.2059,
"step": 20787
},
{
"epoch": 4.363293526084224,
"grad_norm": 1.3786882162094116,
"learning_rate": 4.508410725569639e-05,
"loss": 0.1909,
"step": 20826
},
{
"epoch": 4.371464487743557,
"grad_norm": 1.1708219051361084,
"learning_rate": 4.491953525548541e-05,
"loss": 0.186,
"step": 20865
},
{
"epoch": 4.379635449402891,
"grad_norm": 1.6227660179138184,
"learning_rate": 4.475501885043009e-05,
"loss": 0.1705,
"step": 20904
},
{
"epoch": 4.3878064110622255,
"grad_norm": 1.002186894416809,
"learning_rate": 4.459055984082149e-05,
"loss": 0.2023,
"step": 20943
},
{
"epoch": 4.395977372721559,
"grad_norm": 1.013697862625122,
"learning_rate": 4.442616002632256e-05,
"loss": 0.1989,
"step": 20982
},
{
"epoch": 4.404148334380893,
"grad_norm": 1.0331345796585083,
"learning_rate": 4.426182120594852e-05,
"loss": 0.2052,
"step": 21021
},
{
"epoch": 4.412319296040226,
"grad_norm": 0.5878667235374451,
"learning_rate": 4.40975451780471e-05,
"loss": 0.1815,
"step": 21060
},
{
"epoch": 4.42049025769956,
"grad_norm": 1.0225249528884888,
"learning_rate": 4.3933333740278876e-05,
"loss": 0.1875,
"step": 21099
},
{
"epoch": 4.428661219358894,
"grad_norm": 0.9736365079879761,
"learning_rate": 4.37691886895977e-05,
"loss": 0.1976,
"step": 21138
},
{
"epoch": 4.436832181018228,
"grad_norm": 0.976479709148407,
"learning_rate": 4.3605111822230884e-05,
"loss": 0.19,
"step": 21177
},
{
"epoch": 4.445003142677561,
"grad_norm": 1.0381896495819092,
"learning_rate": 4.3441104933659634e-05,
"loss": 0.186,
"step": 21216
},
{
"epoch": 4.453174104336895,
"grad_norm": 1.292851448059082,
"learning_rate": 4.327716981859938e-05,
"loss": 0.1848,
"step": 21255
},
{
"epoch": 4.461345065996229,
"grad_norm": 1.0627602338790894,
"learning_rate": 4.311330827098019e-05,
"loss": 0.1888,
"step": 21294
},
{
"epoch": 4.469516027655563,
"grad_norm": 1.0795786380767822,
"learning_rate": 4.294952208392699e-05,
"loss": 0.1997,
"step": 21333
},
{
"epoch": 4.477686989314896,
"grad_norm": 1.1152082681655884,
"learning_rate": 4.2785813049740135e-05,
"loss": 0.1813,
"step": 21372
},
{
"epoch": 4.48585795097423,
"grad_norm": 1.2858562469482422,
"learning_rate": 4.2622182959875636e-05,
"loss": 0.2072,
"step": 21411
},
{
"epoch": 4.4940289126335635,
"grad_norm": 1.4869050979614258,
"learning_rate": 4.245863360492567e-05,
"loss": 0.196,
"step": 21450
},
{
"epoch": 4.502199874292898,
"grad_norm": 1.1012799739837646,
"learning_rate": 4.2295166774598906e-05,
"loss": 0.188,
"step": 21489
},
{
"epoch": 4.510370835952231,
"grad_norm": 1.693291187286377,
"learning_rate": 4.213178425770096e-05,
"loss": 0.1935,
"step": 21528
},
{
"epoch": 4.518541797611565,
"grad_norm": 0.9888359308242798,
"learning_rate": 4.196848784211481e-05,
"loss": 0.1956,
"step": 21567
},
{
"epoch": 4.5267127592708984,
"grad_norm": 1.4018559455871582,
"learning_rate": 4.180527931478126e-05,
"loss": 0.2057,
"step": 21606
},
{
"epoch": 4.534883720930233,
"grad_norm": 1.321857213973999,
"learning_rate": 4.164216046167931e-05,
"loss": 0.2073,
"step": 21645
},
{
"epoch": 4.543054682589567,
"grad_norm": 1.2807749509811401,
"learning_rate": 4.147913306780673e-05,
"loss": 0.1973,
"step": 21684
},
{
"epoch": 4.5512256442489,
"grad_norm": 1.4632095098495483,
"learning_rate": 4.131619891716036e-05,
"loss": 0.1909,
"step": 21723
},
{
"epoch": 4.559396605908233,
"grad_norm": 1.3467892408370972,
"learning_rate": 4.1153359792716796e-05,
"loss": 0.1888,
"step": 21762
},
{
"epoch": 4.5675675675675675,
"grad_norm": 1.5710132122039795,
"learning_rate": 4.099061747641268e-05,
"loss": 0.1921,
"step": 21801
},
{
"epoch": 4.575738529226902,
"grad_norm": 1.444769263267517,
"learning_rate": 4.082797374912535e-05,
"loss": 0.1839,
"step": 21840
},
{
"epoch": 4.583909490886235,
"grad_norm": 0.9835878014564514,
"learning_rate": 4.0665430390653256e-05,
"loss": 0.2018,
"step": 21879
},
{
"epoch": 4.592080452545569,
"grad_norm": 1.1722490787506104,
"learning_rate": 4.050298917969654e-05,
"loss": 0.1912,
"step": 21918
},
{
"epoch": 4.6002514142049025,
"grad_norm": 1.0230695009231567,
"learning_rate": 4.0340651893837546e-05,
"loss": 0.203,
"step": 21957
},
{
"epoch": 4.608422375864237,
"grad_norm": 1.4747347831726074,
"learning_rate": 4.017842030952134e-05,
"loss": 0.2023,
"step": 21996
},
{
"epoch": 4.61659333752357,
"grad_norm": 1.2289817333221436,
"learning_rate": 4.001629620203637e-05,
"loss": 0.1899,
"step": 22035
},
{
"epoch": 4.624764299182904,
"grad_norm": 1.2174780368804932,
"learning_rate": 3.985428134549491e-05,
"loss": 0.1867,
"step": 22074
},
{
"epoch": 4.632935260842237,
"grad_norm": 1.4108147621154785,
"learning_rate": 3.969237751281375e-05,
"loss": 0.1881,
"step": 22113
},
{
"epoch": 4.641106222501572,
"grad_norm": 1.0761202573776245,
"learning_rate": 3.9530586475694725e-05,
"loss": 0.196,
"step": 22152
},
{
"epoch": 4.649277184160905,
"grad_norm": 1.2187368869781494,
"learning_rate": 3.936891000460541e-05,
"loss": 0.1878,
"step": 22191
},
{
"epoch": 4.657448145820239,
"grad_norm": 1.1273695230484009,
"learning_rate": 3.920734986875958e-05,
"loss": 0.1805,
"step": 22230
},
{
"epoch": 4.665619107479572,
"grad_norm": 1.0841587781906128,
"learning_rate": 3.904590783609811e-05,
"loss": 0.1961,
"step": 22269
},
{
"epoch": 4.6737900691389065,
"grad_norm": 1.5788570642471313,
"learning_rate": 3.888458567326936e-05,
"loss": 0.2023,
"step": 22308
},
{
"epoch": 4.68196103079824,
"grad_norm": 1.0329240560531616,
"learning_rate": 3.8723385145610034e-05,
"loss": 0.1811,
"step": 22347
},
{
"epoch": 4.690131992457574,
"grad_norm": 1.0109467506408691,
"learning_rate": 3.8562308017125736e-05,
"loss": 0.1842,
"step": 22386
},
{
"epoch": 4.698302954116907,
"grad_norm": 1.0209487676620483,
"learning_rate": 3.840135605047178e-05,
"loss": 0.1952,
"step": 22425
},
{
"epoch": 4.706473915776241,
"grad_norm": 1.1041748523712158,
"learning_rate": 3.82405310069338e-05,
"loss": 0.1918,
"step": 22464
},
{
"epoch": 4.714644877435575,
"grad_norm": 1.1960523128509521,
"learning_rate": 3.807983464640853e-05,
"loss": 0.1988,
"step": 22503
},
{
"epoch": 4.722815839094909,
"grad_norm": 1.1977349519729614,
"learning_rate": 3.7919268727384536e-05,
"loss": 0.1828,
"step": 22542
},
{
"epoch": 4.730986800754243,
"grad_norm": 1.1945958137512207,
"learning_rate": 3.775883500692302e-05,
"loss": 0.1866,
"step": 22581
},
{
"epoch": 4.739157762413576,
"grad_norm": 1.0714681148529053,
"learning_rate": 3.759853524063843e-05,
"loss": 0.2158,
"step": 22620
},
{
"epoch": 4.74732872407291,
"grad_norm": 1.3551781177520752,
"learning_rate": 3.7438371182679485e-05,
"loss": 0.2074,
"step": 22659
},
{
"epoch": 4.755499685732244,
"grad_norm": 0.9765615463256836,
"learning_rate": 3.727834458570979e-05,
"loss": 0.192,
"step": 22698
},
{
"epoch": 4.763670647391578,
"grad_norm": 1.1206810474395752,
"learning_rate": 3.711845720088875e-05,
"loss": 0.1958,
"step": 22737
},
{
"epoch": 4.771841609050911,
"grad_norm": 1.1183277368545532,
"learning_rate": 3.695871077785235e-05,
"loss": 0.1936,
"step": 22776
},
{
"epoch": 4.780012570710245,
"grad_norm": 1.167021632194519,
"learning_rate": 3.679910706469408e-05,
"loss": 0.2096,
"step": 22815
},
{
"epoch": 4.788183532369579,
"grad_norm": 1.72698974609375,
"learning_rate": 3.663964780794574e-05,
"loss": 0.2014,
"step": 22854
},
{
"epoch": 4.796354494028913,
"grad_norm": 1.7358726263046265,
"learning_rate": 3.648033475255837e-05,
"loss": 0.2082,
"step": 22893
},
{
"epoch": 4.804525455688246,
"grad_norm": 1.0222752094268799,
"learning_rate": 3.6321169641883135e-05,
"loss": 0.1821,
"step": 22932
},
{
"epoch": 4.81269641734758,
"grad_norm": 1.2598257064819336,
"learning_rate": 3.616215421765222e-05,
"loss": 0.189,
"step": 22971
},
{
"epoch": 4.820867379006914,
"grad_norm": 1.0362989902496338,
"learning_rate": 3.600329021995988e-05,
"loss": 0.1973,
"step": 23010
},
{
"epoch": 4.829038340666248,
"grad_norm": 1.0669738054275513,
"learning_rate": 3.584457938724322e-05,
"loss": 0.197,
"step": 23049
},
{
"epoch": 4.837209302325581,
"grad_norm": 1.6384053230285645,
"learning_rate": 3.568602345626337e-05,
"loss": 0.1812,
"step": 23088
},
{
"epoch": 4.845380263984915,
"grad_norm": 1.3113985061645508,
"learning_rate": 3.552762416208628e-05,
"loss": 0.1957,
"step": 23127
},
{
"epoch": 4.853551225644249,
"grad_norm": 0.9660436511039734,
"learning_rate": 3.5369383238063943e-05,
"loss": 0.2026,
"step": 23166
},
{
"epoch": 4.861722187303583,
"grad_norm": 1.9137030839920044,
"learning_rate": 3.521130241581524e-05,
"loss": 0.1884,
"step": 23205
},
{
"epoch": 4.869893148962916,
"grad_norm": 1.5459524393081665,
"learning_rate": 3.505338342520711e-05,
"loss": 0.1969,
"step": 23244
},
{
"epoch": 4.87806411062225,
"grad_norm": 0.9911507964134216,
"learning_rate": 3.489562799433555e-05,
"loss": 0.2015,
"step": 23283
},
{
"epoch": 4.886235072281584,
"grad_norm": 0.9340150952339172,
"learning_rate": 3.473803784950675e-05,
"loss": 0.1937,
"step": 23322
},
{
"epoch": 4.894406033940918,
"grad_norm": 1.160861849784851,
"learning_rate": 3.4580614715218185e-05,
"loss": 0.2105,
"step": 23361
},
{
"epoch": 4.902576995600251,
"grad_norm": 1.3635493516921997,
"learning_rate": 3.442336031413978e-05,
"loss": 0.2022,
"step": 23400
},
{
"epoch": 4.910747957259585,
"grad_norm": 1.386326551437378,
"learning_rate": 3.4266276367094927e-05,
"loss": 0.2011,
"step": 23439
},
{
"epoch": 4.918918918918919,
"grad_norm": 1.486642599105835,
"learning_rate": 3.4109364593041896e-05,
"loss": 0.2068,
"step": 23478
},
{
"epoch": 4.927089880578253,
"grad_norm": 1.4408907890319824,
"learning_rate": 3.395262670905474e-05,
"loss": 0.178,
"step": 23517
},
{
"epoch": 4.935260842237587,
"grad_norm": 0.9530181884765625,
"learning_rate": 3.379606443030475e-05,
"loss": 0.2038,
"step": 23556
},
{
"epoch": 4.94343180389692,
"grad_norm": 1.38958740234375,
"learning_rate": 3.3639679470041505e-05,
"loss": 0.193,
"step": 23595
},
{
"epoch": 4.951602765556254,
"grad_norm": 1.2210394144058228,
"learning_rate": 3.348347353957427e-05,
"loss": 0.1854,
"step": 23634
},
{
"epoch": 4.9597737272155875,
"grad_norm": 0.6801496148109436,
"learning_rate": 3.3327448348253144e-05,
"loss": 0.1873,
"step": 23673
},
{
"epoch": 4.967944688874922,
"grad_norm": 1.4520998001098633,
"learning_rate": 3.3171605603450435e-05,
"loss": 0.1955,
"step": 23712
},
{
"epoch": 4.976115650534255,
"grad_norm": 1.0612972974777222,
"learning_rate": 3.3015947010541954e-05,
"loss": 0.1849,
"step": 23751
},
{
"epoch": 4.984286612193589,
"grad_norm": 1.2622356414794922,
"learning_rate": 3.286047427288833e-05,
"loss": 0.1846,
"step": 23790
},
{
"epoch": 4.992457573852922,
"grad_norm": 1.1620802879333496,
"learning_rate": 3.270518909181642e-05,
"loss": 0.1833,
"step": 23829
},
{
"epoch": 5.000628535512257,
"grad_norm": 0.9522203803062439,
"learning_rate": 3.255009316660065e-05,
"loss": 0.211,
"step": 23868
},
{
"epoch": 5.00879949717159,
"grad_norm": 1.0376423597335815,
"learning_rate": 3.239518819444445e-05,
"loss": 0.1211,
"step": 23907
},
{
"epoch": 5.016970458830924,
"grad_norm": 1.249760627746582,
"learning_rate": 3.2240475870461596e-05,
"loss": 0.136,
"step": 23946
},
{
"epoch": 5.025141420490257,
"grad_norm": 1.1804068088531494,
"learning_rate": 3.208595788765786e-05,
"loss": 0.1167,
"step": 23985
},
{
"epoch": 5.0333123821495915,
"grad_norm": 1.0465995073318481,
"learning_rate": 3.193163593691218e-05,
"loss": 0.1261,
"step": 24024
},
{
"epoch": 5.041483343808925,
"grad_norm": 1.0377449989318848,
"learning_rate": 3.177751170695852e-05,
"loss": 0.1205,
"step": 24063
},
{
"epoch": 5.049654305468259,
"grad_norm": 1.3417456150054932,
"learning_rate": 3.162358688436703e-05,
"loss": 0.1091,
"step": 24102
},
{
"epoch": 5.057825267127592,
"grad_norm": 0.9311513900756836,
"learning_rate": 3.1469863153525874e-05,
"loss": 0.1158,
"step": 24141
},
{
"epoch": 5.0659962287869265,
"grad_norm": 1.3761860132217407,
"learning_rate": 3.131634219662262e-05,
"loss": 0.1208,
"step": 24180
},
{
"epoch": 5.074167190446261,
"grad_norm": 1.3109434843063354,
"learning_rate": 3.116302569362594e-05,
"loss": 0.1186,
"step": 24219
},
{
"epoch": 5.082338152105594,
"grad_norm": 1.0222351551055908,
"learning_rate": 3.1009915322267126e-05,
"loss": 0.121,
"step": 24258
},
{
"epoch": 5.090509113764928,
"grad_norm": 1.276938796043396,
"learning_rate": 3.085701275802185e-05,
"loss": 0.1255,
"step": 24297
},
{
"epoch": 5.098680075424261,
"grad_norm": 1.1321660280227661,
"learning_rate": 3.070431967409171e-05,
"loss": 0.1186,
"step": 24336
},
{
"epoch": 5.106851037083596,
"grad_norm": 1.3156259059906006,
"learning_rate": 3.055183774138606e-05,
"loss": 0.1173,
"step": 24375
},
{
"epoch": 5.115021998742929,
"grad_norm": 1.509464979171753,
"learning_rate": 3.03995686285035e-05,
"loss": 0.1272,
"step": 24414
},
{
"epoch": 5.123192960402263,
"grad_norm": 0.9557311534881592,
"learning_rate": 3.0247514001713906e-05,
"loss": 0.1199,
"step": 24453
},
{
"epoch": 5.131363922061596,
"grad_norm": 0.9602863192558289,
"learning_rate": 3.009567552493996e-05,
"loss": 0.1241,
"step": 24492
},
{
"epoch": 5.1395348837209305,
"grad_norm": 1.0989218950271606,
"learning_rate": 2.9944054859739062e-05,
"loss": 0.1268,
"step": 24531
},
{
"epoch": 5.147705845380264,
"grad_norm": 1.0586639642715454,
"learning_rate": 2.9792653665285096e-05,
"loss": 0.1252,
"step": 24570
},
{
"epoch": 5.155876807039598,
"grad_norm": 1.3455604314804077,
"learning_rate": 2.9641473598350322e-05,
"loss": 0.1299,
"step": 24609
},
{
"epoch": 5.164047768698931,
"grad_norm": 1.2703291177749634,
"learning_rate": 2.9490516313287196e-05,
"loss": 0.1196,
"step": 24648
},
{
"epoch": 5.172218730358265,
"grad_norm": 1.1739610433578491,
"learning_rate": 2.9339783462010282e-05,
"loss": 0.1216,
"step": 24687
},
{
"epoch": 5.180389692017599,
"grad_norm": 0.8743981719017029,
"learning_rate": 2.918927669397823e-05,
"loss": 0.1194,
"step": 24726
},
{
"epoch": 5.188560653676933,
"grad_norm": 0.9871875643730164,
"learning_rate": 2.903899765617557e-05,
"loss": 0.1179,
"step": 24765
},
{
"epoch": 5.196731615336266,
"grad_norm": 0.9458325505256653,
"learning_rate": 2.888894799309494e-05,
"loss": 0.123,
"step": 24804
},
{
"epoch": 5.2049025769956,
"grad_norm": 0.8422327637672424,
"learning_rate": 2.873912934671882e-05,
"loss": 0.1177,
"step": 24843
},
{
"epoch": 5.213073538654934,
"grad_norm": 1.0257890224456787,
"learning_rate": 2.8589543356501797e-05,
"loss": 0.1211,
"step": 24882
},
{
"epoch": 5.221244500314268,
"grad_norm": 1.1771949529647827,
"learning_rate": 2.844019165935244e-05,
"loss": 0.13,
"step": 24921
},
{
"epoch": 5.229415461973602,
"grad_norm": 1.2469607591629028,
"learning_rate": 2.829107588961549e-05,
"loss": 0.1185,
"step": 24960
},
{
"epoch": 5.237586423632935,
"grad_norm": 1.5078809261322021,
"learning_rate": 2.8142197679053945e-05,
"loss": 0.1295,
"step": 24999
},
{
"epoch": 5.245757385292269,
"grad_norm": 1.6324515342712402,
"learning_rate": 2.7993558656831253e-05,
"loss": 0.1231,
"step": 25038
},
{
"epoch": 5.253928346951603,
"grad_norm": 1.230471134185791,
"learning_rate": 2.7845160449493368e-05,
"loss": 0.1255,
"step": 25077
},
{
"epoch": 5.262099308610937,
"grad_norm": 0.8506200909614563,
"learning_rate": 2.769700468095113e-05,
"loss": 0.1229,
"step": 25116
},
{
"epoch": 5.27027027027027,
"grad_norm": 0.9934484362602234,
"learning_rate": 2.754909297246223e-05,
"loss": 0.1144,
"step": 25155
},
{
"epoch": 5.278441231929604,
"grad_norm": 0.8861384391784668,
"learning_rate": 2.7401426942613805e-05,
"loss": 0.122,
"step": 25194
},
{
"epoch": 5.286612193588938,
"grad_norm": 1.3057724237442017,
"learning_rate": 2.7254008207304404e-05,
"loss": 0.1232,
"step": 25233
},
{
"epoch": 5.294783155248272,
"grad_norm": 1.3942135572433472,
"learning_rate": 2.710683837972655e-05,
"loss": 0.1366,
"step": 25272
},
{
"epoch": 5.302954116907605,
"grad_norm": 1.338808298110962,
"learning_rate": 2.695991907034895e-05,
"loss": 0.1316,
"step": 25311
},
{
"epoch": 5.311125078566939,
"grad_norm": 1.4900381565093994,
"learning_rate": 2.6813251886898884e-05,
"loss": 0.1152,
"step": 25350
},
{
"epoch": 5.319296040226273,
"grad_norm": 1.2294018268585205,
"learning_rate": 2.6666838434344642e-05,
"loss": 0.1228,
"step": 25389
},
{
"epoch": 5.327467001885607,
"grad_norm": 0.8682739734649658,
"learning_rate": 2.6520680314878017e-05,
"loss": 0.1191,
"step": 25428
},
{
"epoch": 5.33563796354494,
"grad_norm": 1.5014339685440063,
"learning_rate": 2.637477912789662e-05,
"loss": 0.1224,
"step": 25467
},
{
"epoch": 5.343808925204274,
"grad_norm": 1.1386812925338745,
"learning_rate": 2.6229136469986583e-05,
"loss": 0.129,
"step": 25506
},
{
"epoch": 5.3519798868636075,
"grad_norm": 1.290597677230835,
"learning_rate": 2.608375393490483e-05,
"loss": 0.1306,
"step": 25545
},
{
"epoch": 5.360150848522942,
"grad_norm": 1.2870512008666992,
"learning_rate": 2.5938633113561866e-05,
"loss": 0.1151,
"step": 25584
},
{
"epoch": 5.368321810182275,
"grad_norm": 1.4389058351516724,
"learning_rate": 2.5793775594004306e-05,
"loss": 0.1225,
"step": 25623
},
{
"epoch": 5.376492771841609,
"grad_norm": 0.926723062992096,
"learning_rate": 2.5649182961397393e-05,
"loss": 0.1205,
"step": 25662
},
{
"epoch": 5.384663733500942,
"grad_norm": 1.104188323020935,
"learning_rate": 2.5504856798007815e-05,
"loss": 0.1209,
"step": 25701
},
{
"epoch": 5.392834695160277,
"grad_norm": 1.014277696609497,
"learning_rate": 2.536079868318625e-05,
"loss": 0.1306,
"step": 25740
},
{
"epoch": 5.40100565681961,
"grad_norm": 0.6340066194534302,
"learning_rate": 2.5217010193350154e-05,
"loss": 0.1222,
"step": 25779
},
{
"epoch": 5.409176618478944,
"grad_norm": 0.7401300668716431,
"learning_rate": 2.5073492901966477e-05,
"loss": 0.1249,
"step": 25818
},
{
"epoch": 5.417347580138278,
"grad_norm": 1.491322636604309,
"learning_rate": 2.4930248379534534e-05,
"loss": 0.1202,
"step": 25857
},
{
"epoch": 5.4255185417976115,
"grad_norm": 1.3365813493728638,
"learning_rate": 2.478727819356867e-05,
"loss": 0.1208,
"step": 25896
},
{
"epoch": 5.433689503456946,
"grad_norm": 1.0872201919555664,
"learning_rate": 2.464458390858122e-05,
"loss": 0.1239,
"step": 25935
},
{
"epoch": 5.441860465116279,
"grad_norm": 1.453033447265625,
"learning_rate": 2.4502167086065338e-05,
"loss": 0.12,
"step": 25974
},
{
"epoch": 5.450031426775613,
"grad_norm": 1.3627359867095947,
"learning_rate": 2.4360029284477977e-05,
"loss": 0.1221,
"step": 26013
},
{
"epoch": 5.458202388434946,
"grad_norm": 1.5153945684432983,
"learning_rate": 2.42181720592227e-05,
"loss": 0.128,
"step": 26052
},
{
"epoch": 5.466373350094281,
"grad_norm": 1.409711480140686,
"learning_rate": 2.407659696263284e-05,
"loss": 0.1252,
"step": 26091
},
{
"epoch": 5.474544311753614,
"grad_norm": 1.357224702835083,
"learning_rate": 2.393530554395435e-05,
"loss": 0.124,
"step": 26130
},
{
"epoch": 5.482715273412948,
"grad_norm": 1.200933575630188,
"learning_rate": 2.3794299349328924e-05,
"loss": 0.1302,
"step": 26169
},
{
"epoch": 5.490886235072281,
"grad_norm": 1.3625565767288208,
"learning_rate": 2.365357992177707e-05,
"loss": 0.1194,
"step": 26208
},
{
"epoch": 5.4990571967316155,
"grad_norm": 0.9201491475105286,
"learning_rate": 2.3513148801181263e-05,
"loss": 0.1275,
"step": 26247
},
{
"epoch": 5.507228158390949,
"grad_norm": 1.4374864101409912,
"learning_rate": 2.337300752426902e-05,
"loss": 0.1337,
"step": 26286
},
{
"epoch": 5.515399120050283,
"grad_norm": 1.3814523220062256,
"learning_rate": 2.3233157624596117e-05,
"loss": 0.1298,
"step": 26325
},
{
"epoch": 5.523570081709616,
"grad_norm": 1.5755550861358643,
"learning_rate": 2.309360063252981e-05,
"loss": 0.1292,
"step": 26364
},
{
"epoch": 5.5317410433689505,
"grad_norm": 1.1113964319229126,
"learning_rate": 2.2954338075232134e-05,
"loss": 0.1232,
"step": 26403
},
{
"epoch": 5.539912005028284,
"grad_norm": 1.1720763444900513,
"learning_rate": 2.2815371476643087e-05,
"loss": 0.1294,
"step": 26442
},
{
"epoch": 5.548082966687618,
"grad_norm": 1.1691449880599976,
"learning_rate": 2.2676702357464026e-05,
"loss": 0.1219,
"step": 26481
},
{
"epoch": 5.556253928346951,
"grad_norm": 1.1821155548095703,
"learning_rate": 2.2538332235141067e-05,
"loss": 0.124,
"step": 26520
},
{
"epoch": 5.564424890006285,
"grad_norm": 0.8197576999664307,
"learning_rate": 2.2400262623848316e-05,
"loss": 0.1209,
"step": 26559
},
{
"epoch": 5.5725958516656195,
"grad_norm": 1.083306908607483,
"learning_rate": 2.2262495034471533e-05,
"loss": 0.1253,
"step": 26598
},
{
"epoch": 5.580766813324953,
"grad_norm": 1.5298844575881958,
"learning_rate": 2.212503097459136e-05,
"loss": 0.1244,
"step": 26637
},
{
"epoch": 5.588937774984286,
"grad_norm": 1.1826508045196533,
"learning_rate": 2.1987871948467048e-05,
"loss": 0.1245,
"step": 26676
},
{
"epoch": 5.59710873664362,
"grad_norm": 1.1875622272491455,
"learning_rate": 2.18510194570198e-05,
"loss": 0.1267,
"step": 26715
},
{
"epoch": 5.6052796983029545,
"grad_norm": 1.174328327178955,
"learning_rate": 2.1714474997816463e-05,
"loss": 0.1196,
"step": 26754
},
{
"epoch": 5.613450659962288,
"grad_norm": 1.3389631509780884,
"learning_rate": 2.15782400650531e-05,
"loss": 0.1298,
"step": 26793
},
{
"epoch": 5.621621621621622,
"grad_norm": 0.7451180219650269,
"learning_rate": 2.144231614953869e-05,
"loss": 0.1228,
"step": 26832
},
{
"epoch": 5.629792583280955,
"grad_norm": 1.221951961517334,
"learning_rate": 2.1306704738678695e-05,
"loss": 0.124,
"step": 26871
},
{
"epoch": 5.637963544940289,
"grad_norm": 1.422272801399231,
"learning_rate": 2.1171407316458974e-05,
"loss": 0.1249,
"step": 26910
},
{
"epoch": 5.646134506599623,
"grad_norm": 1.273924469947815,
"learning_rate": 2.1036425363429284e-05,
"loss": 0.1251,
"step": 26949
},
{
"epoch": 5.654305468258957,
"grad_norm": 1.1979038715362549,
"learning_rate": 2.0901760356687367e-05,
"loss": 0.1227,
"step": 26988
},
{
"epoch": 5.66247642991829,
"grad_norm": 1.0692418813705444,
"learning_rate": 2.0767413769862543e-05,
"loss": 0.1249,
"step": 27027
},
{
"epoch": 5.670647391577624,
"grad_norm": 1.1677221059799194,
"learning_rate": 2.0633387073099762e-05,
"loss": 0.122,
"step": 27066
},
{
"epoch": 5.678818353236958,
"grad_norm": 1.2823810577392578,
"learning_rate": 2.049968173304339e-05,
"loss": 0.1222,
"step": 27105
},
{
"epoch": 5.686989314896292,
"grad_norm": 1.357614517211914,
"learning_rate": 2.036629921282123e-05,
"loss": 0.1174,
"step": 27144
},
{
"epoch": 5.695160276555625,
"grad_norm": 1.156685709953308,
"learning_rate": 2.0233240972028455e-05,
"loss": 0.1284,
"step": 27183
},
{
"epoch": 5.703331238214959,
"grad_norm": 1.4458805322647095,
"learning_rate": 2.010050846671175e-05,
"loss": 0.1263,
"step": 27222
},
{
"epoch": 5.7115021998742925,
"grad_norm": 1.15256929397583,
"learning_rate": 1.996810314935321e-05,
"loss": 0.1258,
"step": 27261
},
{
"epoch": 5.719673161533627,
"grad_norm": 1.0824165344238281,
"learning_rate": 1.983602646885464e-05,
"loss": 0.1256,
"step": 27300
},
{
"epoch": 5.727844123192961,
"grad_norm": 1.2394015789031982,
"learning_rate": 1.9704279870521475e-05,
"loss": 0.1346,
"step": 27339
},
{
"epoch": 5.736015084852294,
"grad_norm": 1.2990666627883911,
"learning_rate": 1.9572864796047153e-05,
"loss": 0.1351,
"step": 27378
},
{
"epoch": 5.7441860465116275,
"grad_norm": 1.368631362915039,
"learning_rate": 1.944178268349729e-05,
"loss": 0.1198,
"step": 27417
},
{
"epoch": 5.752357008170962,
"grad_norm": 1.3334980010986328,
"learning_rate": 1.9311034967293867e-05,
"loss": 0.1291,
"step": 27456
},
{
"epoch": 5.760527969830296,
"grad_norm": 1.0865552425384521,
"learning_rate": 1.9180623078199654e-05,
"loss": 0.1237,
"step": 27495
},
{
"epoch": 5.768698931489629,
"grad_norm": 1.2265640497207642,
"learning_rate": 1.9050548443302436e-05,
"loss": 0.1206,
"step": 27534
},
{
"epoch": 5.776869893148963,
"grad_norm": 1.4732778072357178,
"learning_rate": 1.8920812485999468e-05,
"loss": 0.1195,
"step": 27573
},
{
"epoch": 5.7850408548082966,
"grad_norm": 0.9459020495414734,
"learning_rate": 1.8791416625981866e-05,
"loss": 0.1263,
"step": 27612
},
{
"epoch": 5.793211816467631,
"grad_norm": 1.1469303369522095,
"learning_rate": 1.8662362279219138e-05,
"loss": 0.1231,
"step": 27651
},
{
"epoch": 5.801382778126964,
"grad_norm": 1.0641534328460693,
"learning_rate": 1.8533650857943574e-05,
"loss": 0.1243,
"step": 27690
},
{
"epoch": 5.809553739786298,
"grad_norm": 1.228223443031311,
"learning_rate": 1.84052837706349e-05,
"loss": 0.1246,
"step": 27729
},
{
"epoch": 5.8177247014456315,
"grad_norm": 1.339967966079712,
"learning_rate": 1.8277262422004775e-05,
"loss": 0.1257,
"step": 27768
},
{
"epoch": 5.825895663104966,
"grad_norm": 0.9778631329536438,
"learning_rate": 1.814958821298153e-05,
"loss": 0.1307,
"step": 27807
},
{
"epoch": 5.834066624764299,
"grad_norm": 1.2558242082595825,
"learning_rate": 1.8022262540694694e-05,
"loss": 0.124,
"step": 27846
},
{
"epoch": 5.842237586423633,
"grad_norm": 1.2275527715682983,
"learning_rate": 1.789528679845987e-05,
"loss": 0.1177,
"step": 27885
},
{
"epoch": 5.850408548082966,
"grad_norm": 0.6827887892723083,
"learning_rate": 1.776866237576334e-05,
"loss": 0.129,
"step": 27924
},
{
"epoch": 5.858579509742301,
"grad_norm": 0.9410415887832642,
"learning_rate": 1.7642390658246927e-05,
"loss": 0.1169,
"step": 27963
},
{
"epoch": 5.866750471401634,
"grad_norm": 1.3443763256072998,
"learning_rate": 1.7516473027692838e-05,
"loss": 0.1215,
"step": 28002
},
{
"epoch": 5.874921433060968,
"grad_norm": 1.5954539775848389,
"learning_rate": 1.739091086200855e-05,
"loss": 0.1203,
"step": 28041
},
{
"epoch": 5.883092394720301,
"grad_norm": 1.1915597915649414,
"learning_rate": 1.72657055352117e-05,
"loss": 0.1314,
"step": 28080
},
{
"epoch": 5.8912633563796355,
"grad_norm": 1.4117431640625,
"learning_rate": 1.7140858417415047e-05,
"loss": 0.1208,
"step": 28119
},
{
"epoch": 5.899434318038969,
"grad_norm": 1.5516468286514282,
"learning_rate": 1.7016370874811487e-05,
"loss": 0.1256,
"step": 28158
},
{
"epoch": 5.907605279698303,
"grad_norm": 1.514825701713562,
"learning_rate": 1.689224426965918e-05,
"loss": 0.1274,
"step": 28197
},
{
"epoch": 5.915776241357637,
"grad_norm": 1.0579338073730469,
"learning_rate": 1.6768479960266497e-05,
"loss": 0.1258,
"step": 28236
},
{
"epoch": 5.92394720301697,
"grad_norm": 1.042872667312622,
"learning_rate": 1.6645079300977268e-05,
"loss": 0.1235,
"step": 28275
},
{
"epoch": 5.932118164676305,
"grad_norm": 1.065023422241211,
"learning_rate": 1.6522043642155914e-05,
"loss": 0.1287,
"step": 28314
},
{
"epoch": 5.940289126335638,
"grad_norm": 0.9318491220474243,
"learning_rate": 1.6399374330172684e-05,
"loss": 0.1158,
"step": 28353
},
{
"epoch": 5.948460087994972,
"grad_norm": 1.3702102899551392,
"learning_rate": 1.627707270738894e-05,
"loss": 0.1193,
"step": 28392
},
{
"epoch": 5.956631049654305,
"grad_norm": 1.1150946617126465,
"learning_rate": 1.615514011214242e-05,
"loss": 0.1247,
"step": 28431
},
{
"epoch": 5.9648020113136395,
"grad_norm": 1.2311586141586304,
"learning_rate": 1.6033577878732646e-05,
"loss": 0.1108,
"step": 28470
},
{
"epoch": 5.972972972972973,
"grad_norm": 1.5974445343017578,
"learning_rate": 1.5912387337406282e-05,
"loss": 0.1184,
"step": 28509
},
{
"epoch": 5.981143934632307,
"grad_norm": 1.2024022340774536,
"learning_rate": 1.579156981434259e-05,
"loss": 0.1217,
"step": 28548
},
{
"epoch": 5.98931489629164,
"grad_norm": 0.7942213416099548,
"learning_rate": 1.5671126631638904e-05,
"loss": 0.1301,
"step": 28587
},
{
"epoch": 5.997485857950974,
"grad_norm": 0.8799581527709961,
"learning_rate": 1.555105910729624e-05,
"loss": 0.1234,
"step": 28626
},
{
"epoch": 6.005656819610308,
"grad_norm": 0.6884328722953796,
"learning_rate": 1.5431368555204756e-05,
"loss": 0.0935,
"step": 28665
},
{
"epoch": 6.013827781269642,
"grad_norm": 0.805902898311615,
"learning_rate": 1.5312056285129445e-05,
"loss": 0.0788,
"step": 28704
},
{
"epoch": 6.021998742928975,
"grad_norm": 1.2972137928009033,
"learning_rate": 1.5193123602695768e-05,
"loss": 0.0777,
"step": 28743
},
{
"epoch": 6.030169704588309,
"grad_norm": 1.0058882236480713,
"learning_rate": 1.5074571809375448e-05,
"loss": 0.0804,
"step": 28782
},
{
"epoch": 6.038340666247643,
"grad_norm": 1.093472957611084,
"learning_rate": 1.4956402202472081e-05,
"loss": 0.0757,
"step": 28821
},
{
"epoch": 6.046511627906977,
"grad_norm": 0.7502733469009399,
"learning_rate": 1.4838616075107109e-05,
"loss": 0.0774,
"step": 28860
},
{
"epoch": 6.05468258956631,
"grad_norm": 0.8688441514968872,
"learning_rate": 1.4721214716205528e-05,
"loss": 0.0776,
"step": 28899
},
{
"epoch": 6.062853551225644,
"grad_norm": 1.0032811164855957,
"learning_rate": 1.4604199410481855e-05,
"loss": 0.0797,
"step": 28938
},
{
"epoch": 6.071024512884978,
"grad_norm": 0.7208006978034973,
"learning_rate": 1.4487571438426045e-05,
"loss": 0.0728,
"step": 28977
},
{
"epoch": 6.079195474544312,
"grad_norm": 1.0811585187911987,
"learning_rate": 1.4371332076289533e-05,
"loss": 0.075,
"step": 29016
},
{
"epoch": 6.087366436203646,
"grad_norm": 1.0255405902862549,
"learning_rate": 1.4255482596071173e-05,
"loss": 0.0777,
"step": 29055
},
{
"epoch": 6.095537397862979,
"grad_norm": 0.777233362197876,
"learning_rate": 1.4140024265503398e-05,
"loss": 0.075,
"step": 29094
},
{
"epoch": 6.103708359522313,
"grad_norm": 0.6485877633094788,
"learning_rate": 1.4024958348038303e-05,
"loss": 0.0757,
"step": 29133
},
{
"epoch": 6.111879321181647,
"grad_norm": 0.903620183467865,
"learning_rate": 1.3910286102833831e-05,
"loss": 0.0769,
"step": 29172
},
{
"epoch": 6.120050282840981,
"grad_norm": 1.1562800407409668,
"learning_rate": 1.3796008784740033e-05,
"loss": 0.0754,
"step": 29211
},
{
"epoch": 6.128221244500314,
"grad_norm": 1.2452480792999268,
"learning_rate": 1.3682127644285253e-05,
"loss": 0.076,
"step": 29250
},
{
"epoch": 6.136392206159648,
"grad_norm": 1.2981902360916138,
"learning_rate": 1.356864392766256e-05,
"loss": 0.0795,
"step": 29289
},
{
"epoch": 6.144563167818982,
"grad_norm": 1.130982756614685,
"learning_rate": 1.3455558876715945e-05,
"loss": 0.081,
"step": 29328
},
{
"epoch": 6.152734129478316,
"grad_norm": 0.753930389881134,
"learning_rate": 1.3342873728926925e-05,
"loss": 0.081,
"step": 29367
},
{
"epoch": 6.160905091137649,
"grad_norm": 1.069864273071289,
"learning_rate": 1.3230589717400833e-05,
"loss": 0.0787,
"step": 29406
},
{
"epoch": 6.169076052796983,
"grad_norm": 0.5375205278396606,
"learning_rate": 1.3118708070853465e-05,
"loss": 0.0802,
"step": 29445
},
{
"epoch": 6.1772470144563165,
"grad_norm": 0.8602518439292908,
"learning_rate": 1.300723001359751e-05,
"loss": 0.0799,
"step": 29484
},
{
"epoch": 6.185417976115651,
"grad_norm": 0.8582605719566345,
"learning_rate": 1.2896156765529226e-05,
"loss": 0.0763,
"step": 29523
},
{
"epoch": 6.193588937774984,
"grad_norm": 1.3171412944793701,
"learning_rate": 1.2785489542115076e-05,
"loss": 0.0752,
"step": 29562
},
{
"epoch": 6.201759899434318,
"grad_norm": 0.9550933241844177,
"learning_rate": 1.2675229554378437e-05,
"loss": 0.0751,
"step": 29601
},
{
"epoch": 6.2099308610936514,
"grad_norm": 0.6330301761627197,
"learning_rate": 1.2565378008886309e-05,
"loss": 0.0776,
"step": 29640
},
{
"epoch": 6.218101822752986,
"grad_norm": 1.2466429471969604,
"learning_rate": 1.2455936107736193e-05,
"loss": 0.0771,
"step": 29679
},
{
"epoch": 6.226272784412319,
"grad_norm": 0.771265983581543,
"learning_rate": 1.2346905048542785e-05,
"loss": 0.0798,
"step": 29718
},
{
"epoch": 6.234443746071653,
"grad_norm": 0.9940363764762878,
"learning_rate": 1.223828602442506e-05,
"loss": 0.0772,
"step": 29757
},
{
"epoch": 6.242614707730986,
"grad_norm": 1.3721771240234375,
"learning_rate": 1.2130080223993073e-05,
"loss": 0.0807,
"step": 29796
},
{
"epoch": 6.2507856693903205,
"grad_norm": 0.8592971563339233,
"learning_rate": 1.2022288831335039e-05,
"loss": 0.0753,
"step": 29835
},
{
"epoch": 6.258956631049655,
"grad_norm": 1.4567697048187256,
"learning_rate": 1.1914913026004298e-05,
"loss": 0.0816,
"step": 29874
},
{
"epoch": 6.267127592708988,
"grad_norm": 0.986755907535553,
"learning_rate": 1.1807953983006476e-05,
"loss": 0.0816,
"step": 29913
},
{
"epoch": 6.275298554368322,
"grad_norm": 1.187451958656311,
"learning_rate": 1.1701412872786571e-05,
"loss": 0.0769,
"step": 29952
},
{
"epoch": 6.2834695160276555,
"grad_norm": 1.22755765914917,
"learning_rate": 1.1595290861216217e-05,
"loss": 0.0804,
"step": 29991
},
{
"epoch": 6.285355122564425,
"eval_accuracy": 0.009820309467613697,
"eval_loss": 0.6394578814506531,
"eval_runtime": 809.8591,
"eval_samples_per_second": 5.908,
"eval_steps_per_second": 1.478,
"step": 30000
},
{
"epoch": 6.29164047768699,
"grad_norm": 0.9166001081466675,
"learning_rate": 1.1489589109580857e-05,
"loss": 0.0817,
"step": 30030
},
{
"epoch": 6.299811439346323,
"grad_norm": 0.9760183691978455,
"learning_rate": 1.1384308774567048e-05,
"loss": 0.0767,
"step": 30069
},
{
"epoch": 6.307982401005657,
"grad_norm": 1.0955647230148315,
"learning_rate": 1.127945100824983e-05,
"loss": 0.0801,
"step": 30108
},
{
"epoch": 6.31615336266499,
"grad_norm": 0.9437817335128784,
"learning_rate": 1.1175016958080093e-05,
"loss": 0.0843,
"step": 30147
},
{
"epoch": 6.324324324324325,
"grad_norm": 0.9094163775444031,
"learning_rate": 1.1071007766872065e-05,
"loss": 0.0795,
"step": 30186
},
{
"epoch": 6.332495285983658,
"grad_norm": 1.1390501260757446,
"learning_rate": 1.096742457279073e-05,
"loss": 0.0785,
"step": 30225
},
{
"epoch": 6.340666247642992,
"grad_norm": 1.0265370607376099,
"learning_rate": 1.0864268509339454e-05,
"loss": 0.0746,
"step": 30264
},
{
"epoch": 6.348837209302325,
"grad_norm": 0.9284585118293762,
"learning_rate": 1.0761540705347516e-05,
"loss": 0.0813,
"step": 30303
},
{
"epoch": 6.3570081709616595,
"grad_norm": 0.9370750784873962,
"learning_rate": 1.0659242284957799e-05,
"loss": 0.0772,
"step": 30342
},
{
"epoch": 6.365179132620993,
"grad_norm": 1.0358294248580933,
"learning_rate": 1.0557374367614448e-05,
"loss": 0.0803,
"step": 30381
},
{
"epoch": 6.373350094280327,
"grad_norm": 1.1319186687469482,
"learning_rate": 1.0455938068050691e-05,
"loss": 0.0791,
"step": 30420
},
{
"epoch": 6.38152105593966,
"grad_norm": 1.2648582458496094,
"learning_rate": 1.0354934496276552e-05,
"loss": 0.0809,
"step": 30459
},
{
"epoch": 6.389692017598994,
"grad_norm": 1.2434043884277344,
"learning_rate": 1.0254364757566759e-05,
"loss": 0.0773,
"step": 30498
},
{
"epoch": 6.397862979258328,
"grad_norm": 1.1660242080688477,
"learning_rate": 1.015422995244863e-05,
"loss": 0.078,
"step": 30537
},
{
"epoch": 6.406033940917662,
"grad_norm": 1.1408329010009766,
"learning_rate": 1.0054531176690068e-05,
"loss": 0.0852,
"step": 30576
},
{
"epoch": 6.414204902576996,
"grad_norm": 0.8835148215293884,
"learning_rate": 9.955269521287486e-06,
"loss": 0.0765,
"step": 30615
},
{
"epoch": 6.422375864236329,
"grad_norm": 0.9826139211654663,
"learning_rate": 9.856446072453979e-06,
"loss": 0.0786,
"step": 30654
},
{
"epoch": 6.4305468258956635,
"grad_norm": 1.3077845573425293,
"learning_rate": 9.758061911607336e-06,
"loss": 0.0806,
"step": 30693
},
{
"epoch": 6.438717787554997,
"grad_norm": 1.2694175243377686,
"learning_rate": 9.660118115358263e-06,
"loss": 0.0788,
"step": 30732
},
{
"epoch": 6.446888749214331,
"grad_norm": 1.1133708953857422,
"learning_rate": 9.562615755498572e-06,
"loss": 0.0773,
"step": 30771
},
{
"epoch": 6.455059710873664,
"grad_norm": 0.8764528632164001,
"learning_rate": 9.46555589898952e-06,
"loss": 0.0796,
"step": 30810
},
{
"epoch": 6.463230672532998,
"grad_norm": 1.1313438415527344,
"learning_rate": 9.368939607950033e-06,
"loss": 0.0745,
"step": 30849
},
{
"epoch": 6.471401634192332,
"grad_norm": 1.3065073490142822,
"learning_rate": 9.272767939645138e-06,
"loss": 0.0771,
"step": 30888
},
{
"epoch": 6.479572595851666,
"grad_norm": 1.2640775442123413,
"learning_rate": 9.177041946474408e-06,
"loss": 0.0791,
"step": 30927
},
{
"epoch": 6.487743557510999,
"grad_norm": 1.4018340110778809,
"learning_rate": 9.081762675960398e-06,
"loss": 0.0829,
"step": 30966
},
{
"epoch": 6.495914519170333,
"grad_norm": 0.9985345602035522,
"learning_rate": 8.986931170737244e-06,
"loss": 0.0781,
"step": 31005
},
{
"epoch": 6.504085480829667,
"grad_norm": 1.4044971466064453,
"learning_rate": 8.892548468539186e-06,
"loss": 0.0801,
"step": 31044
},
{
"epoch": 6.512256442489001,
"grad_norm": 1.2589378356933594,
"learning_rate": 8.798615602189297e-06,
"loss": 0.0761,
"step": 31083
},
{
"epoch": 6.520427404148334,
"grad_norm": 0.7944054007530212,
"learning_rate": 8.705133599588033e-06,
"loss": 0.0788,
"step": 31122
},
{
"epoch": 6.528598365807668,
"grad_norm": 0.9065793752670288,
"learning_rate": 8.612103483702183e-06,
"loss": 0.0767,
"step": 31161
},
{
"epoch": 6.536769327467002,
"grad_norm": 1.2598634958267212,
"learning_rate": 8.519526272553501e-06,
"loss": 0.0778,
"step": 31200
},
{
"epoch": 6.544940289126336,
"grad_norm": 1.159141182899475,
"learning_rate": 8.427402979207699e-06,
"loss": 0.0817,
"step": 31239
},
{
"epoch": 6.553111250785669,
"grad_norm": 0.9926244616508484,
"learning_rate": 8.335734611763246e-06,
"loss": 0.0819,
"step": 31278
},
{
"epoch": 6.561282212445003,
"grad_norm": 1.3477091789245605,
"learning_rate": 8.24452217334042e-06,
"loss": 0.0771,
"step": 31317
},
{
"epoch": 6.569453174104337,
"grad_norm": 0.9037400484085083,
"learning_rate": 8.153766662070267e-06,
"loss": 0.0776,
"step": 31356
},
{
"epoch": 6.577624135763671,
"grad_norm": 0.8474344611167908,
"learning_rate": 8.063469071083767e-06,
"loss": 0.0783,
"step": 31395
},
{
"epoch": 6.585795097423004,
"grad_norm": 1.3324404954910278,
"learning_rate": 7.973630388500858e-06,
"loss": 0.0772,
"step": 31434
},
{
"epoch": 6.593966059082338,
"grad_norm": 1.1879874467849731,
"learning_rate": 7.884251597419733e-06,
"loss": 0.0766,
"step": 31473
},
{
"epoch": 6.602137020741672,
"grad_norm": 1.297776699066162,
"learning_rate": 7.795333675905953e-06,
"loss": 0.073,
"step": 31512
},
{
"epoch": 6.610307982401006,
"grad_norm": 1.2616163492202759,
"learning_rate": 7.706877596981887e-06,
"loss": 0.0753,
"step": 31551
},
{
"epoch": 6.61847894406034,
"grad_norm": 0.8011958599090576,
"learning_rate": 7.6188843286159386e-06,
"loss": 0.0769,
"step": 31590
},
{
"epoch": 6.626649905719673,
"grad_norm": 0.9508606195449829,
"learning_rate": 7.531354833712062e-06,
"loss": 0.0749,
"step": 31629
},
{
"epoch": 6.634820867379007,
"grad_norm": 1.0559347867965698,
"learning_rate": 7.444290070099131e-06,
"loss": 0.0781,
"step": 31668
},
{
"epoch": 6.6429918290383405,
"grad_norm": 0.8347210884094238,
"learning_rate": 7.357690990520516e-06,
"loss": 0.0749,
"step": 31707
},
{
"epoch": 6.651162790697675,
"grad_norm": 0.994552731513977,
"learning_rate": 7.271558542623618e-06,
"loss": 0.0795,
"step": 31746
},
{
"epoch": 6.659333752357008,
"grad_norm": 1.0162602663040161,
"learning_rate": 7.185893668949573e-06,
"loss": 0.0764,
"step": 31785
},
{
"epoch": 6.667504714016342,
"grad_norm": 1.130615472793579,
"learning_rate": 7.100697306922843e-06,
"loss": 0.0806,
"step": 31824
},
{
"epoch": 6.675675675675675,
"grad_norm": 0.9059649705886841,
"learning_rate": 7.015970388841004e-06,
"loss": 0.0771,
"step": 31863
},
{
"epoch": 6.68384663733501,
"grad_norm": 1.5551890134811401,
"learning_rate": 6.931713841864562e-06,
"loss": 0.0845,
"step": 31902
},
{
"epoch": 6.692017598994343,
"grad_norm": 1.3091990947723389,
"learning_rate": 6.847928588006747e-06,
"loss": 0.08,
"step": 31941
},
{
"epoch": 6.700188560653677,
"grad_norm": 1.2463719844818115,
"learning_rate": 6.764615544123526e-06,
"loss": 0.0842,
"step": 31980
},
{
"epoch": 6.70835952231301,
"grad_norm": 0.863456666469574,
"learning_rate": 6.681775621903441e-06,
"loss": 0.0804,
"step": 32019
},
{
"epoch": 6.7165304839723445,
"grad_norm": 0.8332017064094543,
"learning_rate": 6.599409727857753e-06,
"loss": 0.0745,
"step": 32058
},
{
"epoch": 6.724701445631678,
"grad_norm": 1.1635913848876953,
"learning_rate": 6.517518763310426e-06,
"loss": 0.0779,
"step": 32097
},
{
"epoch": 6.732872407291012,
"grad_norm": 1.0755412578582764,
"learning_rate": 6.436103624388329e-06,
"loss": 0.0768,
"step": 32136
},
{
"epoch": 6.741043368950345,
"grad_norm": 1.05198073387146,
"learning_rate": 6.355165202011376e-06,
"loss": 0.0815,
"step": 32175
},
{
"epoch": 6.7492143306096795,
"grad_norm": 1.1024662256240845,
"learning_rate": 6.2747043818828565e-06,
"loss": 0.0781,
"step": 32214
},
{
"epoch": 6.757385292269014,
"grad_norm": 1.3981742858886719,
"learning_rate": 6.19472204447965e-06,
"loss": 0.0823,
"step": 32253
},
{
"epoch": 6.765556253928347,
"grad_norm": 0.8348293304443359,
"learning_rate": 6.115219065042654e-06,
"loss": 0.075,
"step": 32292
},
{
"epoch": 6.77372721558768,
"grad_norm": 1.1249663829803467,
"learning_rate": 6.036196313567172e-06,
"loss": 0.0763,
"step": 32331
},
{
"epoch": 6.781898177247014,
"grad_norm": 0.8369470238685608,
"learning_rate": 5.9576546547934376e-06,
"loss": 0.0782,
"step": 32370
},
{
"epoch": 6.7900691389063486,
"grad_norm": 1.1055430173873901,
"learning_rate": 5.879594948197076e-06,
"loss": 0.0789,
"step": 32409
},
{
"epoch": 6.798240100565682,
"grad_norm": 0.6298280954360962,
"learning_rate": 5.802018047979796e-06,
"loss": 0.075,
"step": 32448
},
{
"epoch": 6.806411062225016,
"grad_norm": 1.0825151205062866,
"learning_rate": 5.724924803059961e-06,
"loss": 0.0777,
"step": 32487
},
{
"epoch": 6.814582023884349,
"grad_norm": 0.5902610421180725,
"learning_rate": 5.648316057063324e-06,
"loss": 0.0804,
"step": 32526
},
{
"epoch": 6.8227529855436835,
"grad_norm": 1.0861594676971436,
"learning_rate": 5.5721926483138174e-06,
"loss": 0.0771,
"step": 32565
},
{
"epoch": 6.830923947203017,
"grad_norm": 1.347240924835205,
"learning_rate": 5.496555409824367e-06,
"loss": 0.0812,
"step": 32604
},
{
"epoch": 6.839094908862351,
"grad_norm": 1.0528011322021484,
"learning_rate": 5.421405169287757e-06,
"loss": 0.0779,
"step": 32643
},
{
"epoch": 6.847265870521684,
"grad_norm": 0.6324211359024048,
"learning_rate": 5.346742749067602e-06,
"loss": 0.0775,
"step": 32682
},
{
"epoch": 6.855436832181018,
"grad_norm": 0.8667871952056885,
"learning_rate": 5.272568966189323e-06,
"loss": 0.0758,
"step": 32721
},
{
"epoch": 6.863607793840352,
"grad_norm": 1.3395044803619385,
"learning_rate": 5.198884632331213e-06,
"loss": 0.0814,
"step": 32760
},
{
"epoch": 6.871778755499686,
"grad_norm": 0.953387975692749,
"learning_rate": 5.1256905538155935e-06,
"loss": 0.0792,
"step": 32799
},
{
"epoch": 6.879949717159019,
"grad_norm": 1.0777264833450317,
"learning_rate": 5.052987531599917e-06,
"loss": 0.078,
"step": 32838
},
{
"epoch": 6.888120678818353,
"grad_norm": 0.7305690050125122,
"learning_rate": 4.980776361268086e-06,
"loss": 0.0803,
"step": 32877
},
{
"epoch": 6.896291640477687,
"grad_norm": 0.9803299307823181,
"learning_rate": 4.909057833021641e-06,
"loss": 0.0759,
"step": 32916
},
{
"epoch": 6.904462602137021,
"grad_norm": 1.090122938156128,
"learning_rate": 4.837832731671238e-06,
"loss": 0.0829,
"step": 32955
},
{
"epoch": 6.912633563796355,
"grad_norm": 1.2111872434616089,
"learning_rate": 4.767101836627963e-06,
"loss": 0.0776,
"step": 32994
},
{
"epoch": 6.920804525455688,
"grad_norm": 1.0040581226348877,
"learning_rate": 4.696865921894861e-06,
"loss": 0.0746,
"step": 33033
},
{
"epoch": 6.9289754871150215,
"grad_norm": 1.1057442426681519,
"learning_rate": 4.627125756058426e-06,
"loss": 0.0759,
"step": 33072
},
{
"epoch": 6.937146448774356,
"grad_norm": 1.387105941772461,
"learning_rate": 4.557882102280208e-06,
"loss": 0.0808,
"step": 33111
},
{
"epoch": 6.94531741043369,
"grad_norm": 1.119165301322937,
"learning_rate": 4.489135718288462e-06,
"loss": 0.0763,
"step": 33150
},
{
"epoch": 6.953488372093023,
"grad_norm": 1.335654616355896,
"learning_rate": 4.4208873563698796e-06,
"loss": 0.0787,
"step": 33189
},
{
"epoch": 6.961659333752357,
"grad_norm": 1.074046015739441,
"learning_rate": 4.353137763361303e-06,
"loss": 0.0801,
"step": 33228
},
{
"epoch": 6.969830295411691,
"grad_norm": 0.976398229598999,
"learning_rate": 4.285887680641598e-06,
"loss": 0.0791,
"step": 33267
},
{
"epoch": 6.978001257071025,
"grad_norm": 0.9488142728805542,
"learning_rate": 4.2191378441235106e-06,
"loss": 0.0818,
"step": 33306
},
{
"epoch": 6.986172218730358,
"grad_norm": 0.848330557346344,
"learning_rate": 4.152888984245656e-06,
"loss": 0.0763,
"step": 33345
},
{
"epoch": 6.994343180389692,
"grad_norm": 1.041487216949463,
"learning_rate": 4.087141825964469e-06,
"loss": 0.0791,
"step": 33384
},
{
"epoch": 7.002514142049026,
"grad_norm": 0.7451361417770386,
"learning_rate": 4.021897088746329e-06,
"loss": 0.0729,
"step": 33423
},
{
"epoch": 7.01068510370836,
"grad_norm": 0.6712239384651184,
"learning_rate": 3.957155486559633e-06,
"loss": 0.0587,
"step": 33462
},
{
"epoch": 7.018856065367693,
"grad_norm": 0.4839955270290375,
"learning_rate": 3.892917727867024e-06,
"loss": 0.0567,
"step": 33501
},
{
"epoch": 7.027027027027027,
"grad_norm": 0.7181938886642456,
"learning_rate": 3.829184515617601e-06,
"loss": 0.061,
"step": 33540
},
{
"epoch": 7.0351979886863605,
"grad_norm": 0.9729601144790649,
"learning_rate": 3.7659565472392856e-06,
"loss": 0.0605,
"step": 33579
},
{
"epoch": 7.043368950345695,
"grad_norm": 0.8530760407447815,
"learning_rate": 3.7032345146311264e-06,
"loss": 0.0574,
"step": 33618
},
{
"epoch": 7.051539912005028,
"grad_norm": 0.8998908996582031,
"learning_rate": 3.6410191041557463e-06,
"loss": 0.0592,
"step": 33657
},
{
"epoch": 7.059710873664362,
"grad_norm": 0.7366337180137634,
"learning_rate": 3.5793109966318627e-06,
"loss": 0.0579,
"step": 33696
},
{
"epoch": 7.067881835323695,
"grad_norm": 0.9024779200553894,
"learning_rate": 3.518110867326785e-06,
"loss": 0.0583,
"step": 33735
},
{
"epoch": 7.07605279698303,
"grad_norm": 0.9106222987174988,
"learning_rate": 3.45741938594909e-06,
"loss": 0.0582,
"step": 33774
},
{
"epoch": 7.084223758642363,
"grad_norm": 0.7838183045387268,
"learning_rate": 3.397237216641225e-06,
"loss": 0.0592,
"step": 33813
},
{
"epoch": 7.092394720301697,
"grad_norm": 0.7830872535705566,
"learning_rate": 3.337565017972305e-06,
"loss": 0.0584,
"step": 33852
},
{
"epoch": 7.100565681961031,
"grad_norm": 0.8272370100021362,
"learning_rate": 3.2784034429308276e-06,
"loss": 0.0585,
"step": 33891
},
{
"epoch": 7.1087366436203645,
"grad_norm": 0.8741708397865295,
"learning_rate": 3.2197531389176193e-06,
"loss": 0.0589,
"step": 33930
},
{
"epoch": 7.116907605279699,
"grad_norm": 0.9171742796897888,
"learning_rate": 3.161614747738667e-06,
"loss": 0.0591,
"step": 33969
},
{
"epoch": 7.125078566939032,
"grad_norm": 0.939560055732727,
"learning_rate": 3.103988905598171e-06,
"loss": 0.06,
"step": 34008
},
{
"epoch": 7.133249528598366,
"grad_norm": 0.8792917728424072,
"learning_rate": 3.0468762430915176e-06,
"loss": 0.0586,
"step": 34047
},
{
"epoch": 7.141420490257699,
"grad_norm": 0.7668449878692627,
"learning_rate": 2.990277385198409e-06,
"loss": 0.0569,
"step": 34086
},
{
"epoch": 7.149591451917034,
"grad_norm": 0.7971295118331909,
"learning_rate": 2.9341929512760156e-06,
"loss": 0.0588,
"step": 34125
},
{
"epoch": 7.157762413576367,
"grad_norm": 0.6899027228355408,
"learning_rate": 2.87862355505224e-06,
"loss": 0.0572,
"step": 34164
},
{
"epoch": 7.165933375235701,
"grad_norm": 0.6905819177627563,
"learning_rate": 2.8235698046189084e-06,
"loss": 0.0567,
"step": 34203
},
{
"epoch": 7.174104336895034,
"grad_norm": 0.8013573884963989,
"learning_rate": 2.769032302425234e-06,
"loss": 0.0586,
"step": 34242
},
{
"epoch": 7.1822752985543685,
"grad_norm": 0.7228589057922363,
"learning_rate": 2.715011645271093e-06,
"loss": 0.0573,
"step": 34281
},
{
"epoch": 7.190446260213702,
"grad_norm": 0.7927389144897461,
"learning_rate": 2.6615084243006348e-06,
"loss": 0.0595,
"step": 34320
},
{
"epoch": 7.198617221873036,
"grad_norm": 0.8713605403900146,
"learning_rate": 2.6085232249956903e-06,
"loss": 0.0573,
"step": 34359
},
{
"epoch": 7.206788183532369,
"grad_norm": 0.7807958722114563,
"learning_rate": 2.556056627169451e-06,
"loss": 0.0576,
"step": 34398
},
{
"epoch": 7.2149591451917034,
"grad_norm": 0.8027368783950806,
"learning_rate": 2.5041092049600646e-06,
"loss": 0.0549,
"step": 34437
},
{
"epoch": 7.223130106851037,
"grad_norm": 1.1289767026901245,
"learning_rate": 2.452681526824391e-06,
"loss": 0.0595,
"step": 34476
},
{
"epoch": 7.231301068510371,
"grad_norm": 0.7572363018989563,
"learning_rate": 2.401774155531772e-06,
"loss": 0.0599,
"step": 34515
},
{
"epoch": 7.239472030169704,
"grad_norm": 0.7623870372772217,
"learning_rate": 2.351387648157849e-06,
"loss": 0.0593,
"step": 34554
},
{
"epoch": 7.247642991829038,
"grad_norm": 0.8214910626411438,
"learning_rate": 2.301522556078517e-06,
"loss": 0.0583,
"step": 34593
},
{
"epoch": 7.2558139534883725,
"grad_norm": 1.091100811958313,
"learning_rate": 2.2521794249638516e-06,
"loss": 0.0591,
"step": 34632
},
{
"epoch": 7.263984915147706,
"grad_norm": 0.8992879986763,
"learning_rate": 2.2033587947721258e-06,
"loss": 0.0591,
"step": 34671
},
{
"epoch": 7.27215587680704,
"grad_norm": 0.7699416279792786,
"learning_rate": 2.1550611997439464e-06,
"loss": 0.057,
"step": 34710
},
{
"epoch": 7.280326838466373,
"grad_norm": 0.744487464427948,
"learning_rate": 2.1072871683963824e-06,
"loss": 0.0577,
"step": 34749
},
{
"epoch": 7.2884978001257075,
"grad_norm": 1.0493528842926025,
"learning_rate": 2.060037223517175e-06,
"loss": 0.056,
"step": 34788
},
{
"epoch": 7.296668761785041,
"grad_norm": 0.5897573232650757,
"learning_rate": 2.013311882159036e-06,
"loss": 0.059,
"step": 34827
},
{
"epoch": 7.304839723444375,
"grad_norm": 0.8210305571556091,
"learning_rate": 1.967111655633963e-06,
"loss": 0.0592,
"step": 34866
},
{
"epoch": 7.313010685103708,
"grad_norm": 0.845564603805542,
"learning_rate": 1.9214370495076793e-06,
"loss": 0.0587,
"step": 34905
},
{
"epoch": 7.321181646763042,
"grad_norm": 0.815892219543457,
"learning_rate": 1.876288563594064e-06,
"loss": 0.0596,
"step": 34944
},
{
"epoch": 7.329352608422376,
"grad_norm": 0.8726412057876587,
"learning_rate": 1.8316666919497238e-06,
"loss": 0.0585,
"step": 34983
},
{
"epoch": 7.33752357008171,
"grad_norm": 0.9395358562469482,
"learning_rate": 1.7875719228685362e-06,
"loss": 0.0573,
"step": 35022
},
{
"epoch": 7.345694531741043,
"grad_norm": 0.7253791689872742,
"learning_rate": 1.7440047388763536e-06,
"loss": 0.0587,
"step": 35061
},
{
"epoch": 7.353865493400377,
"grad_norm": 0.7980459928512573,
"learning_rate": 1.7009656167256916e-06,
"loss": 0.0574,
"step": 35100
},
{
"epoch": 7.362036455059711,
"grad_norm": 0.6651678681373596,
"learning_rate": 1.658455027390543e-06,
"loss": 0.0562,
"step": 35139
},
{
"epoch": 7.370207416719045,
"grad_norm": 0.7697405219078064,
"learning_rate": 1.6164734360611722e-06,
"loss": 0.0584,
"step": 35178
},
{
"epoch": 7.378378378378378,
"grad_norm": 0.7216864824295044,
"learning_rate": 1.5750213021390959e-06,
"loss": 0.0573,
"step": 35217
},
{
"epoch": 7.386549340037712,
"grad_norm": 0.6651769876480103,
"learning_rate": 1.534099079231982e-06,
"loss": 0.0574,
"step": 35256
},
{
"epoch": 7.3947203016970455,
"grad_norm": 0.7012266516685486,
"learning_rate": 1.493707215148743e-06,
"loss": 0.0599,
"step": 35295
},
{
"epoch": 7.40289126335638,
"grad_norm": 0.8982451558113098,
"learning_rate": 1.4538461518945945e-06,
"loss": 0.0584,
"step": 35334
},
{
"epoch": 7.411062225015713,
"grad_norm": 0.9067208766937256,
"learning_rate": 1.4145163256662596e-06,
"loss": 0.0588,
"step": 35373
},
{
"epoch": 7.419233186675047,
"grad_norm": 0.8877490162849426,
"learning_rate": 1.375718166847162e-06,
"loss": 0.059,
"step": 35412
},
{
"epoch": 7.4274041483343805,
"grad_norm": 1.038710117340088,
"learning_rate": 1.337452100002723e-06,
"loss": 0.0554,
"step": 35451
},
{
"epoch": 7.435575109993715,
"grad_norm": 0.9557565450668335,
"learning_rate": 1.299718543875722e-06,
"loss": 0.0605,
"step": 35490
},
{
"epoch": 7.443746071653049,
"grad_norm": 1.0523297786712646,
"learning_rate": 1.262517911381722e-06,
"loss": 0.0568,
"step": 35529
},
{
"epoch": 7.451917033312382,
"grad_norm": 1.0325270891189575,
"learning_rate": 1.225850609604534e-06,
"loss": 0.0624,
"step": 35568
},
{
"epoch": 7.460087994971716,
"grad_norm": 0.7806089520454407,
"learning_rate": 1.1897170397917656e-06,
"loss": 0.0588,
"step": 35607
},
{
"epoch": 7.4682589566310495,
"grad_norm": 0.9255437850952148,
"learning_rate": 1.154117597350457e-06,
"loss": 0.0596,
"step": 35646
},
{
"epoch": 7.476429918290384,
"grad_norm": 1.1577228307724,
"learning_rate": 1.1190526718426908e-06,
"loss": 0.0602,
"step": 35685
},
{
"epoch": 7.484600879949717,
"grad_norm": 0.7052533626556396,
"learning_rate": 1.0845226469814007e-06,
"loss": 0.0586,
"step": 35724
},
{
"epoch": 7.492771841609051,
"grad_norm": 0.9362422227859497,
"learning_rate": 1.050527900626136e-06,
"loss": 0.0575,
"step": 35763
},
{
"epoch": 7.5009428032683845,
"grad_norm": 0.7695039510726929,
"learning_rate": 1.0170688047789367e-06,
"loss": 0.0564,
"step": 35802
},
{
"epoch": 7.509113764927719,
"grad_norm": 0.7317225933074951,
"learning_rate": 9.841457255802434e-07,
"loss": 0.0576,
"step": 35841
},
{
"epoch": 7.517284726587052,
"grad_norm": 0.8200843930244446,
"learning_rate": 9.517590233049156e-07,
"loss": 0.055,
"step": 35880
},
{
"epoch": 7.525455688246386,
"grad_norm": 1.0528461933135986,
"learning_rate": 9.199090523582754e-07,
"loss": 0.0573,
"step": 35919
},
{
"epoch": 7.533626649905719,
"grad_norm": 0.7340377569198608,
"learning_rate": 8.885961612722371e-07,
"loss": 0.0568,
"step": 35958
},
{
"epoch": 7.541797611565054,
"grad_norm": 0.9129870533943176,
"learning_rate": 8.578206927014887e-07,
"loss": 0.0589,
"step": 35997
},
{
"epoch": 7.549968573224387,
"grad_norm": 0.8497533798217773,
"learning_rate": 8.275829834197446e-07,
"loss": 0.0598,
"step": 36036
},
{
"epoch": 7.558139534883721,
"grad_norm": 0.8787517547607422,
"learning_rate": 7.978833643160433e-07,
"loss": 0.0564,
"step": 36075
},
{
"epoch": 7.566310496543054,
"grad_norm": 0.8719078898429871,
"learning_rate": 7.687221603911666e-07,
"loss": 0.0574,
"step": 36114
},
{
"epoch": 7.5744814582023885,
"grad_norm": 0.9348114132881165,
"learning_rate": 7.400996907540314e-07,
"loss": 0.0585,
"step": 36153
},
{
"epoch": 7.582652419861722,
"grad_norm": 0.5755512118339539,
"learning_rate": 7.120162686182541e-07,
"loss": 0.0603,
"step": 36192
},
{
"epoch": 7.590823381521056,
"grad_norm": 0.6366615295410156,
"learning_rate": 6.844722012986692e-07,
"loss": 0.0576,
"step": 36231
},
{
"epoch": 7.59899434318039,
"grad_norm": 0.8005051016807556,
"learning_rate": 6.574677902079995e-07,
"loss": 0.0574,
"step": 36270
},
{
"epoch": 7.607165304839723,
"grad_norm": 0.7573316693305969,
"learning_rate": 6.310033308535523e-07,
"loss": 0.0569,
"step": 36309
},
{
"epoch": 7.615336266499057,
"grad_norm": 0.8414756059646606,
"learning_rate": 6.050791128339672e-07,
"loss": 0.0583,
"step": 36348
},
{
"epoch": 7.623507228158391,
"grad_norm": 0.8651686906814575,
"learning_rate": 5.796954198360905e-07,
"loss": 0.0572,
"step": 36387
},
{
"epoch": 7.631678189817725,
"grad_norm": 0.808057427406311,
"learning_rate": 5.548525296318163e-07,
"loss": 0.0588,
"step": 36426
},
{
"epoch": 7.639849151477058,
"grad_norm": 0.7138866186141968,
"learning_rate": 5.305507140750843e-07,
"loss": 0.0579,
"step": 36465
},
{
"epoch": 7.6480201131363925,
"grad_norm": 0.8983772993087769,
"learning_rate": 5.067902390988866e-07,
"loss": 0.0582,
"step": 36504
},
{
"epoch": 7.656191074795726,
"grad_norm": 0.9842520952224731,
"learning_rate": 4.835713647123818e-07,
"loss": 0.0592,
"step": 36543
},
{
"epoch": 7.66436203645506,
"grad_norm": 0.6843677163124084,
"learning_rate": 4.608943449980141e-07,
"loss": 0.0612,
"step": 36582
},
{
"epoch": 7.672532998114393,
"grad_norm": 0.8898476362228394,
"learning_rate": 4.387594281087593e-07,
"loss": 0.0577,
"step": 36621
},
{
"epoch": 7.680703959773727,
"grad_norm": 0.8016347885131836,
"learning_rate": 4.171668562654052e-07,
"loss": 0.0562,
"step": 36660
},
{
"epoch": 7.688874921433061,
"grad_norm": 0.9552780389785767,
"learning_rate": 3.9611686575388716e-07,
"loss": 0.0598,
"step": 36699
},
{
"epoch": 7.697045883092395,
"grad_norm": 0.7063938975334167,
"learning_rate": 3.756096869227177e-07,
"loss": 0.0567,
"step": 36738
},
{
"epoch": 7.705216844751728,
"grad_norm": 0.6397302746772766,
"learning_rate": 3.556455441804607e-07,
"loss": 0.0609,
"step": 36777
},
{
"epoch": 7.713387806411062,
"grad_norm": 0.8387033343315125,
"learning_rate": 3.362246559932891e-07,
"loss": 0.0588,
"step": 36816
},
{
"epoch": 7.721558768070396,
"grad_norm": 0.7996549606323242,
"learning_rate": 3.173472348825479e-07,
"loss": 0.058,
"step": 36855
},
{
"epoch": 7.72972972972973,
"grad_norm": 1.0533095598220825,
"learning_rate": 2.990134874224948e-07,
"loss": 0.0606,
"step": 36894
},
{
"epoch": 7.737900691389063,
"grad_norm": 1.0325462818145752,
"learning_rate": 2.812236142379909e-07,
"loss": 0.0545,
"step": 36933
},
{
"epoch": 7.746071653048397,
"grad_norm": 0.8322693109512329,
"learning_rate": 2.6397781000231934e-07,
"loss": 0.054,
"step": 36972
},
{
"epoch": 7.7542426147077315,
"grad_norm": 0.989177405834198,
"learning_rate": 2.4727626343506447e-07,
"loss": 0.0545,
"step": 37011
},
{
"epoch": 7.762413576367065,
"grad_norm": 0.8909984827041626,
"learning_rate": 2.3111915730003043e-07,
"loss": 0.0584,
"step": 37050
},
{
"epoch": 7.770584538026398,
"grad_norm": 0.8045913577079773,
"learning_rate": 2.1550666840325363e-07,
"loss": 0.0591,
"step": 37089
},
{
"epoch": 7.778755499685732,
"grad_norm": 0.8635491132736206,
"learning_rate": 2.0043896759106007e-07,
"loss": 0.057,
"step": 37128
},
{
"epoch": 7.786926461345066,
"grad_norm": 0.6361021995544434,
"learning_rate": 1.859162197482056e-07,
"loss": 0.0597,
"step": 37167
},
{
"epoch": 7.7950974230044,
"grad_norm": 0.6608051657676697,
"learning_rate": 1.719385837960663e-07,
"loss": 0.0551,
"step": 37206
},
{
"epoch": 7.803268384663734,
"grad_norm": 0.6683517694473267,
"learning_rate": 1.5850621269088984e-07,
"loss": 0.0561,
"step": 37245
},
{
"epoch": 7.811439346323067,
"grad_norm": 0.8555870056152344,
"learning_rate": 1.456192534221301e-07,
"loss": 0.0581,
"step": 37284
},
{
"epoch": 7.819610307982401,
"grad_norm": 0.8728473782539368,
"learning_rate": 1.3327784701084868e-07,
"loss": 0.0598,
"step": 37323
},
{
"epoch": 7.827781269641735,
"grad_norm": 0.810082733631134,
"learning_rate": 1.214821285081602e-07,
"loss": 0.058,
"step": 37362
},
{
"epoch": 7.835952231301069,
"grad_norm": 0.9439653754234314,
"learning_rate": 1.1023222699375057e-07,
"loss": 0.0569,
"step": 37401
},
{
"epoch": 7.844123192960402,
"grad_norm": 0.8079851269721985,
"learning_rate": 9.952826557447226e-08,
"loss": 0.0572,
"step": 37440
},
{
"epoch": 7.852294154619736,
"grad_norm": 0.750708281993866,
"learning_rate": 8.937036138298993e-08,
"loss": 0.0588,
"step": 37479
},
{
"epoch": 7.8604651162790695,
"grad_norm": 0.9982680678367615,
"learning_rate": 7.975862557652036e-08,
"loss": 0.0575,
"step": 37518
},
{
"epoch": 7.868636077938404,
"grad_norm": 1.1361589431762695,
"learning_rate": 7.06931633355834e-08,
"loss": 0.0579,
"step": 37557
},
{
"epoch": 7.876807039597737,
"grad_norm": 0.7971227169036865,
"learning_rate": 6.2174073862864e-08,
"loss": 0.0574,
"step": 37596
},
{
"epoch": 7.884978001257071,
"grad_norm": 1.0375800132751465,
"learning_rate": 5.4201450382151965e-08,
"loss": 0.0561,
"step": 37635
},
{
"epoch": 7.893148962916404,
"grad_norm": 0.6980372667312622,
"learning_rate": 4.677538013727612e-08,
"loss": 0.0582,
"step": 37674
},
{
"epoch": 7.901319924575739,
"grad_norm": 0.6636334657669067,
"learning_rate": 3.989594439118838e-08,
"loss": 0.0575,
"step": 37713
},
{
"epoch": 7.909490886235072,
"grad_norm": 0.8450205326080322,
"learning_rate": 3.356321842504784e-08,
"loss": 0.058,
"step": 37752
},
{
"epoch": 7.917661847894406,
"grad_norm": 0.7700900435447693,
"learning_rate": 2.7777271537410276e-08,
"loss": 0.0553,
"step": 37791
},
{
"epoch": 7.925832809553739,
"grad_norm": 0.7557202577590942,
"learning_rate": 2.2538167043484326e-08,
"loss": 0.0582,
"step": 37830
},
{
"epoch": 7.9340037712130735,
"grad_norm": 0.8920982480049133,
"learning_rate": 1.7845962274393168e-08,
"loss": 0.0585,
"step": 37869
},
{
"epoch": 7.942174732872408,
"grad_norm": 0.7392795085906982,
"learning_rate": 1.3700708576602772e-08,
"loss": 0.057,
"step": 37908
},
{
"epoch": 7.950345694531741,
"grad_norm": 0.6727070212364197,
"learning_rate": 1.0102451311316818e-08,
"loss": 0.0577,
"step": 37947
},
{
"epoch": 7.958516656191075,
"grad_norm": 1.0015376806259155,
"learning_rate": 7.051229854010411e-09,
"loss": 0.0573,
"step": 37986
},
{
"epoch": 7.9666876178504085,
"grad_norm": 0.8055163621902466,
"learning_rate": 4.547077593963778e-09,
"loss": 0.0589,
"step": 38025
},
{
"epoch": 7.974858579509743,
"grad_norm": 0.6668297052383423,
"learning_rate": 2.590021933945863e-09,
"loss": 0.0588,
"step": 38064
},
{
"epoch": 7.983029541169076,
"grad_norm": 1.031164526939392,
"learning_rate": 1.1800842898701535e-09,
"loss": 0.0579,
"step": 38103
},
{
"epoch": 7.99120050282841,
"grad_norm": 0.8890305161476135,
"learning_rate": 3.17280090578187e-10,
"loss": 0.0597,
"step": 38142
},
{
"epoch": 7.999371464487743,
"grad_norm": 0.9338026642799377,
"learning_rate": 1.6187776730181015e-12,
"loss": 0.0594,
"step": 38181
},
{
"epoch": 8.0,
"step": 38184,
"total_flos": 1.457270055797103e+19,
"train_loss": 0.24881920519854858,
"train_runtime": 60756.0963,
"train_samples_per_second": 2.514,
"train_steps_per_second": 0.628
}
],
"logging_steps": 39,
"max_steps": 38184,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.457270055797103e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}