Training in progress, epoch 77, checkpoint
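For reference, a minimal sketch of how a trainer_state.json like the one below can be inspected offline. The local filename is an assumption (it is illustrative, not taken from the checkpoint); the keys used (best_metric, best_model_checkpoint, log_history, eval_loss, epoch) are those visible in the file.

# Minimal sketch: load a Trainer state file and list per-epoch eval losses.
# Assumes the JSON shown below has been saved locally as "trainer_state.json".
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Best value of the metric used for checkpoint selection
# (apparently eval_loss, judging by the log below) and where it was saved.
print(state["best_metric"])
print(state["best_model_checkpoint"])

# log_history mixes training steps and evaluation entries;
# evaluation entries are the ones carrying an "eval_loss" key.
eval_history = [
    (entry["epoch"], entry["eval_loss"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]
for epoch, eval_loss in eval_history:
    print(f"epoch {epoch:>5.1f}: eval_loss = {eval_loss:.4f}")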
{
"best_metric": 12.716951370239258,
"best_model_checkpoint": "/kaggle/working/output/checkpoint-88740",
"epoch": 77.0,
"eval_steps": 500,
"global_step": 100485,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07662835249042145,
"grad_norm": 8.174947738647461,
"learning_rate": 4.9952586206896554e-05,
"loss": 96.5258,
"step": 100
},
{
"epoch": 0.1532567049808429,
"grad_norm": 8.584559440612793,
"learning_rate": 4.990469348659004e-05,
"loss": 48.0822,
"step": 200
},
{
"epoch": 0.22988505747126436,
"grad_norm": 8.02587604522705,
"learning_rate": 4.985680076628353e-05,
"loss": 31.9469,
"step": 300
},
{
"epoch": 0.3065134099616858,
"grad_norm": 6.968703746795654,
"learning_rate": 4.9808908045977015e-05,
"loss": 24.973,
"step": 400
},
{
"epoch": 0.3831417624521073,
"grad_norm": 6.017839431762695,
"learning_rate": 4.97610153256705e-05,
"loss": 20.7473,
"step": 500
},
{
"epoch": 0.45977011494252873,
"grad_norm": 4.75618839263916,
"learning_rate": 4.971312260536399e-05,
"loss": 18.6219,
"step": 600
},
{
"epoch": 0.5363984674329502,
"grad_norm": 3.5624868869781494,
"learning_rate": 4.9665229885057475e-05,
"loss": 17.1775,
"step": 700
},
{
"epoch": 0.6130268199233716,
"grad_norm": 2.889848470687866,
"learning_rate": 4.961733716475096e-05,
"loss": 16.1131,
"step": 800
},
{
"epoch": 0.6896551724137931,
"grad_norm": 8.15518856048584,
"learning_rate": 4.956944444444445e-05,
"loss": 15.8697,
"step": 900
},
{
"epoch": 0.7662835249042146,
"grad_norm": 3.092848539352417,
"learning_rate": 4.952155172413793e-05,
"loss": 15.5523,
"step": 1000
},
{
"epoch": 0.842911877394636,
"grad_norm": 2.181015968322754,
"learning_rate": 4.9473659003831416e-05,
"loss": 15.5628,
"step": 1100
},
{
"epoch": 0.9195402298850575,
"grad_norm": 2.1515514850616455,
"learning_rate": 4.94257662835249e-05,
"loss": 15.3004,
"step": 1200
},
{
"epoch": 0.9961685823754789,
"grad_norm": 1.476803183555603,
"learning_rate": 4.937787356321839e-05,
"loss": 15.3448,
"step": 1300
},
{
"epoch": 1.0,
"eval_loss": 15.641121864318848,
"eval_runtime": 44.0061,
"eval_samples_per_second": 29.655,
"eval_steps_per_second": 3.727,
"step": 1305
},
{
"epoch": 1.0727969348659003,
"grad_norm": 3.050917863845825,
"learning_rate": 4.932998084291188e-05,
"loss": 14.901,
"step": 1400
},
{
"epoch": 1.1494252873563218,
"grad_norm": 1.6784011125564575,
"learning_rate": 4.928208812260537e-05,
"loss": 14.7073,
"step": 1500
},
{
"epoch": 1.2260536398467432,
"grad_norm": 3.2630977630615234,
"learning_rate": 4.923419540229886e-05,
"loss": 14.9142,
"step": 1600
},
{
"epoch": 1.3026819923371646,
"grad_norm": 1.6106696128845215,
"learning_rate": 4.9186302681992344e-05,
"loss": 14.9731,
"step": 1700
},
{
"epoch": 1.3793103448275863,
"grad_norm": 4.378266334533691,
"learning_rate": 4.9138409961685824e-05,
"loss": 14.5922,
"step": 1800
},
{
"epoch": 1.4559386973180077,
"grad_norm": 2.196368455886841,
"learning_rate": 4.909051724137931e-05,
"loss": 15.024,
"step": 1900
},
{
"epoch": 1.5325670498084292,
"grad_norm": 1.1820286512374878,
"learning_rate": 4.90426245210728e-05,
"loss": 14.6291,
"step": 2000
},
{
"epoch": 1.6091954022988506,
"grad_norm": 2.6733219623565674,
"learning_rate": 4.8994731800766285e-05,
"loss": 15.1916,
"step": 2100
},
{
"epoch": 1.685823754789272,
"grad_norm": 2.461630344390869,
"learning_rate": 4.894683908045977e-05,
"loss": 14.7438,
"step": 2200
},
{
"epoch": 1.7624521072796935,
"grad_norm": 1.7039703130722046,
"learning_rate": 4.889894636015326e-05,
"loss": 14.3014,
"step": 2300
},
{
"epoch": 1.839080459770115,
"grad_norm": 2.291198253631592,
"learning_rate": 4.8851053639846746e-05,
"loss": 14.5648,
"step": 2400
},
{
"epoch": 1.9157088122605364,
"grad_norm": 2.088695764541626,
"learning_rate": 4.880316091954023e-05,
"loss": 14.2778,
"step": 2500
},
{
"epoch": 1.9923371647509578,
"grad_norm": 1.9745572805404663,
"learning_rate": 4.875526819923372e-05,
"loss": 14.612,
"step": 2600
},
{
"epoch": 2.0,
"eval_loss": 14.947260856628418,
"eval_runtime": 44.059,
"eval_samples_per_second": 29.619,
"eval_steps_per_second": 3.722,
"step": 2610
},
{
"epoch": 2.0689655172413794,
"grad_norm": 3.296757698059082,
"learning_rate": 4.8707375478927206e-05,
"loss": 14.4268,
"step": 2700
},
{
"epoch": 2.1455938697318007,
"grad_norm": 1.2265104055404663,
"learning_rate": 4.865948275862069e-05,
"loss": 14.3716,
"step": 2800
},
{
"epoch": 2.2222222222222223,
"grad_norm": 2.3575916290283203,
"learning_rate": 4.861159003831418e-05,
"loss": 14.2911,
"step": 2900
},
{
"epoch": 2.2988505747126435,
"grad_norm": 1.535346508026123,
"learning_rate": 4.856369731800767e-05,
"loss": 14.0469,
"step": 3000
},
{
"epoch": 2.375478927203065,
"grad_norm": 2.3857269287109375,
"learning_rate": 4.8515804597701154e-05,
"loss": 14.0246,
"step": 3100
},
{
"epoch": 2.4521072796934864,
"grad_norm": 1.46570885181427,
"learning_rate": 4.846791187739464e-05,
"loss": 14.0864,
"step": 3200
},
{
"epoch": 2.528735632183908,
"grad_norm": 1.3398170471191406,
"learning_rate": 4.842001915708813e-05,
"loss": 14.1075,
"step": 3300
},
{
"epoch": 2.6053639846743293,
"grad_norm": 1.4247232675552368,
"learning_rate": 4.8372126436781614e-05,
"loss": 13.9681,
"step": 3400
},
{
"epoch": 2.681992337164751,
"grad_norm": 1.602295160293579,
"learning_rate": 4.83242337164751e-05,
"loss": 14.0847,
"step": 3500
},
{
"epoch": 2.7586206896551726,
"grad_norm": 1.8135626316070557,
"learning_rate": 4.827634099616858e-05,
"loss": 13.9871,
"step": 3600
},
{
"epoch": 2.835249042145594,
"grad_norm": 2.3612937927246094,
"learning_rate": 4.822844827586207e-05,
"loss": 14.043,
"step": 3700
},
{
"epoch": 2.9118773946360155,
"grad_norm": 2.1295549869537354,
"learning_rate": 4.8180555555555555e-05,
"loss": 14.0695,
"step": 3800
},
{
"epoch": 2.9885057471264367,
"grad_norm": 2.768362283706665,
"learning_rate": 4.813266283524904e-05,
"loss": 13.8804,
"step": 3900
},
{
"epoch": 3.0,
"eval_loss": 14.543105125427246,
"eval_runtime": 44.0531,
"eval_samples_per_second": 29.623,
"eval_steps_per_second": 3.723,
"step": 3915
},
{
"epoch": 3.0651340996168583,
"grad_norm": 2.190544366836548,
"learning_rate": 4.808477011494253e-05,
"loss": 13.8831,
"step": 4000
},
{
"epoch": 3.1417624521072796,
"grad_norm": 1.6555811166763306,
"learning_rate": 4.8036877394636016e-05,
"loss": 13.661,
"step": 4100
},
{
"epoch": 3.218390804597701,
"grad_norm": 1.1204612255096436,
"learning_rate": 4.798898467432951e-05,
"loss": 13.9753,
"step": 4200
},
{
"epoch": 3.2950191570881224,
"grad_norm": 2.3801109790802,
"learning_rate": 4.7941091954022996e-05,
"loss": 13.9332,
"step": 4300
},
{
"epoch": 3.371647509578544,
"grad_norm": 1.314393162727356,
"learning_rate": 4.7893199233716476e-05,
"loss": 13.8442,
"step": 4400
},
{
"epoch": 3.4482758620689653,
"grad_norm": 2.0551559925079346,
"learning_rate": 4.784530651340996e-05,
"loss": 13.5678,
"step": 4500
},
{
"epoch": 3.524904214559387,
"grad_norm": 1.4303470849990845,
"learning_rate": 4.779741379310345e-05,
"loss": 13.7754,
"step": 4600
},
{
"epoch": 3.6015325670498086,
"grad_norm": 2.2181780338287354,
"learning_rate": 4.774952107279694e-05,
"loss": 13.5568,
"step": 4700
},
{
"epoch": 3.67816091954023,
"grad_norm": 1.377549648284912,
"learning_rate": 4.7701628352490424e-05,
"loss": 13.4359,
"step": 4800
},
{
"epoch": 3.7547892720306515,
"grad_norm": 1.6644877195358276,
"learning_rate": 4.765373563218391e-05,
"loss": 13.6701,
"step": 4900
},
{
"epoch": 3.8314176245210727,
"grad_norm": 1.6416462659835815,
"learning_rate": 4.76058429118774e-05,
"loss": 13.6427,
"step": 5000
},
{
"epoch": 3.9080459770114944,
"grad_norm": 1.5726954936981201,
"learning_rate": 4.7557950191570885e-05,
"loss": 13.6802,
"step": 5100
},
{
"epoch": 3.9846743295019156,
"grad_norm": 1.3120722770690918,
"learning_rate": 4.751005747126437e-05,
"loss": 13.6631,
"step": 5200
},
{
"epoch": 4.0,
"eval_loss": 14.28848934173584,
"eval_runtime": 44.0456,
"eval_samples_per_second": 29.628,
"eval_steps_per_second": 3.723,
"step": 5220
},
{
"epoch": 4.061302681992337,
"grad_norm": 1.9124590158462524,
"learning_rate": 4.746216475095785e-05,
"loss": 13.5388,
"step": 5300
},
{
"epoch": 4.137931034482759,
"grad_norm": 1.3689558506011963,
"learning_rate": 4.741427203065134e-05,
"loss": 13.5553,
"step": 5400
},
{
"epoch": 4.21455938697318,
"grad_norm": 1.6370700597763062,
"learning_rate": 4.7366379310344825e-05,
"loss": 13.5781,
"step": 5500
},
{
"epoch": 4.291187739463601,
"grad_norm": 1.993304967880249,
"learning_rate": 4.731848659003832e-05,
"loss": 13.5261,
"step": 5600
},
{
"epoch": 4.3678160919540225,
"grad_norm": 2.3975770473480225,
"learning_rate": 4.7270593869731806e-05,
"loss": 13.4305,
"step": 5700
},
{
"epoch": 4.444444444444445,
"grad_norm": 1.9231036901474,
"learning_rate": 4.722270114942529e-05,
"loss": 13.3994,
"step": 5800
},
{
"epoch": 4.521072796934866,
"grad_norm": 1.0928981304168701,
"learning_rate": 4.717480842911878e-05,
"loss": 13.3212,
"step": 5900
},
{
"epoch": 4.597701149425287,
"grad_norm": 1.3092130422592163,
"learning_rate": 4.7126915708812266e-05,
"loss": 13.4476,
"step": 6000
},
{
"epoch": 4.674329501915709,
"grad_norm": 2.0151021480560303,
"learning_rate": 4.7079022988505747e-05,
"loss": 13.1863,
"step": 6100
},
{
"epoch": 4.75095785440613,
"grad_norm": 1.2778387069702148,
"learning_rate": 4.7031130268199233e-05,
"loss": 13.3661,
"step": 6200
},
{
"epoch": 4.827586206896552,
"grad_norm": 1.1671264171600342,
"learning_rate": 4.698371647509579e-05,
"loss": 13.3803,
"step": 6300
},
{
"epoch": 4.904214559386973,
"grad_norm": 0.9788312911987305,
"learning_rate": 4.693582375478928e-05,
"loss": 13.495,
"step": 6400
},
{
"epoch": 4.980842911877395,
"grad_norm": 3.2978639602661133,
"learning_rate": 4.6887931034482766e-05,
"loss": 13.4834,
"step": 6500
},
{
"epoch": 5.0,
"eval_loss": 14.041104316711426,
"eval_runtime": 43.9982,
"eval_samples_per_second": 29.66,
"eval_steps_per_second": 3.727,
"step": 6525
},
{
"epoch": 5.057471264367816,
"grad_norm": 1.6198067665100098,
"learning_rate": 4.6840038314176246e-05,
"loss": 13.1646,
"step": 6600
},
{
"epoch": 5.134099616858237,
"grad_norm": 5.732328414916992,
"learning_rate": 4.679214559386973e-05,
"loss": 13.4168,
"step": 6700
},
{
"epoch": 5.210727969348659,
"grad_norm": 1.518420934677124,
"learning_rate": 4.674425287356322e-05,
"loss": 13.2907,
"step": 6800
},
{
"epoch": 5.287356321839081,
"grad_norm": 1.6062932014465332,
"learning_rate": 4.6696360153256706e-05,
"loss": 13.406,
"step": 6900
},
{
"epoch": 5.363984674329502,
"grad_norm": 2.5659947395324707,
"learning_rate": 4.664846743295019e-05,
"loss": 13.252,
"step": 7000
},
{
"epoch": 5.440613026819923,
"grad_norm": 1.4965115785598755,
"learning_rate": 4.660057471264368e-05,
"loss": 13.2683,
"step": 7100
},
{
"epoch": 5.517241379310345,
"grad_norm": 2.3210604190826416,
"learning_rate": 4.655268199233717e-05,
"loss": 13.1846,
"step": 7200
},
{
"epoch": 5.593869731800766,
"grad_norm": 1.508138656616211,
"learning_rate": 4.6504789272030654e-05,
"loss": 13.1303,
"step": 7300
},
{
"epoch": 5.670498084291188,
"grad_norm": 1.2769402265548706,
"learning_rate": 4.645689655172414e-05,
"loss": 13.1109,
"step": 7400
},
{
"epoch": 5.747126436781609,
"grad_norm": 3.0062999725341797,
"learning_rate": 4.640900383141763e-05,
"loss": 13.1859,
"step": 7500
},
{
"epoch": 5.823754789272031,
"grad_norm": 1.4893639087677002,
"learning_rate": 4.636111111111111e-05,
"loss": 13.2236,
"step": 7600
},
{
"epoch": 5.900383141762452,
"grad_norm": 1.9955596923828125,
"learning_rate": 4.63132183908046e-05,
"loss": 13.2806,
"step": 7700
},
{
"epoch": 5.977011494252873,
"grad_norm": 1.733920931816101,
"learning_rate": 4.626532567049809e-05,
"loss": 12.9426,
"step": 7800
},
{
"epoch": 6.0,
"eval_loss": 13.950128555297852,
"eval_runtime": 44.0078,
"eval_samples_per_second": 29.654,
"eval_steps_per_second": 3.727,
"step": 7830
},
{
"epoch": 6.053639846743295,
"grad_norm": 1.3697247505187988,
"learning_rate": 4.6217432950191575e-05,
"loss": 13.001,
"step": 7900
},
{
"epoch": 6.130268199233717,
"grad_norm": 1.7222646474838257,
"learning_rate": 4.616954022988506e-05,
"loss": 13.1098,
"step": 8000
},
{
"epoch": 6.206896551724138,
"grad_norm": 1.5488767623901367,
"learning_rate": 4.612164750957855e-05,
"loss": 13.2406,
"step": 8100
},
{
"epoch": 6.283524904214559,
"grad_norm": 1.1356619596481323,
"learning_rate": 4.6073754789272036e-05,
"loss": 13.0969,
"step": 8200
},
{
"epoch": 6.360153256704981,
"grad_norm": 2.161534547805786,
"learning_rate": 4.602586206896552e-05,
"loss": 12.8021,
"step": 8300
},
{
"epoch": 6.436781609195402,
"grad_norm": 1.42888605594635,
"learning_rate": 4.5977969348659e-05,
"loss": 13.007,
"step": 8400
},
{
"epoch": 6.513409961685824,
"grad_norm": 1.5181623697280884,
"learning_rate": 4.593007662835249e-05,
"loss": 13.2494,
"step": 8500
},
{
"epoch": 6.590038314176245,
"grad_norm": 2.6794161796569824,
"learning_rate": 4.588218390804598e-05,
"loss": 13.0472,
"step": 8600
},
{
"epoch": 6.666666666666667,
"grad_norm": 1.3213189840316772,
"learning_rate": 4.5834291187739464e-05,
"loss": 12.7648,
"step": 8700
},
{
"epoch": 6.743295019157088,
"grad_norm": 1.1679490804672241,
"learning_rate": 4.578639846743295e-05,
"loss": 13.0907,
"step": 8800
},
{
"epoch": 6.819923371647509,
"grad_norm": 1.7697467803955078,
"learning_rate": 4.573850574712644e-05,
"loss": 12.8777,
"step": 8900
},
{
"epoch": 6.896551724137931,
"grad_norm": 1.7574371099472046,
"learning_rate": 4.5690613026819924e-05,
"loss": 12.8949,
"step": 9000
},
{
"epoch": 6.973180076628353,
"grad_norm": 1.8508405685424805,
"learning_rate": 4.564272030651342e-05,
"loss": 13.0364,
"step": 9100
},
{
"epoch": 7.0,
"eval_loss": 13.742591857910156,
"eval_runtime": 44.1082,
"eval_samples_per_second": 29.586,
"eval_steps_per_second": 3.718,
"step": 9135
},
{
"epoch": 7.049808429118774,
"grad_norm": 1.304430365562439,
"learning_rate": 4.55948275862069e-05,
"loss": 13.1197,
"step": 9200
},
{
"epoch": 7.126436781609195,
"grad_norm": 1.112478256225586,
"learning_rate": 4.5546934865900385e-05,
"loss": 13.072,
"step": 9300
},
{
"epoch": 7.203065134099617,
"grad_norm": 1.6277681589126587,
"learning_rate": 4.5499521072796937e-05,
"loss": 12.8787,
"step": 9400
},
{
"epoch": 7.2796934865900385,
"grad_norm": 1.6854459047317505,
"learning_rate": 4.5451628352490423e-05,
"loss": 12.9961,
"step": 9500
},
{
"epoch": 7.35632183908046,
"grad_norm": 1.5988355875015259,
"learning_rate": 4.540373563218391e-05,
"loss": 12.9588,
"step": 9600
},
{
"epoch": 7.432950191570881,
"grad_norm": 1.0676491260528564,
"learning_rate": 4.53558429118774e-05,
"loss": 12.8359,
"step": 9700
},
{
"epoch": 7.509578544061303,
"grad_norm": 1.8556437492370605,
"learning_rate": 4.5307950191570884e-05,
"loss": 12.813,
"step": 9800
},
{
"epoch": 7.586206896551724,
"grad_norm": 1.5877550840377808,
"learning_rate": 4.526005747126437e-05,
"loss": 12.9205,
"step": 9900
},
{
"epoch": 7.662835249042145,
"grad_norm": 1.2095483541488647,
"learning_rate": 4.521216475095786e-05,
"loss": 12.9472,
"step": 10000
},
{
"epoch": 7.739463601532567,
"grad_norm": 3.998228073120117,
"learning_rate": 4.5164272030651345e-05,
"loss": 12.871,
"step": 10100
},
{
"epoch": 7.816091954022989,
"grad_norm": 1.4408106803894043,
"learning_rate": 4.511637931034483e-05,
"loss": 12.9723,
"step": 10200
},
{
"epoch": 7.89272030651341,
"grad_norm": 0.9685239791870117,
"learning_rate": 4.506848659003832e-05,
"loss": 12.7816,
"step": 10300
},
{
"epoch": 7.969348659003831,
"grad_norm": 2.4164698123931885,
"learning_rate": 4.5020593869731805e-05,
"loss": 12.8656,
"step": 10400
},
{
"epoch": 8.0,
"eval_loss": 13.643902778625488,
"eval_runtime": 44.1312,
"eval_samples_per_second": 29.571,
"eval_steps_per_second": 3.716,
"step": 10440
},
{
"epoch": 8.045977011494253,
"grad_norm": 1.4973284006118774,
"learning_rate": 4.497270114942529e-05,
"loss": 12.9654,
"step": 10500
},
{
"epoch": 8.122605363984674,
"grad_norm": 1.9837547540664673,
"learning_rate": 4.492480842911877e-05,
"loss": 12.9358,
"step": 10600
},
{
"epoch": 8.199233716475096,
"grad_norm": 2.1501142978668213,
"learning_rate": 4.487691570881226e-05,
"loss": 12.9226,
"step": 10700
},
{
"epoch": 8.275862068965518,
"grad_norm": 1.959155797958374,
"learning_rate": 4.4829022988505746e-05,
"loss": 12.8136,
"step": 10800
},
{
"epoch": 8.352490421455938,
"grad_norm": 1.7081148624420166,
"learning_rate": 4.478113026819923e-05,
"loss": 12.6215,
"step": 10900
},
{
"epoch": 8.42911877394636,
"grad_norm": 3.0818092823028564,
"learning_rate": 4.473323754789272e-05,
"loss": 12.7263,
"step": 11000
},
{
"epoch": 8.505747126436782,
"grad_norm": 1.2609460353851318,
"learning_rate": 4.468534482758621e-05,
"loss": 12.615,
"step": 11100
},
{
"epoch": 8.582375478927203,
"grad_norm": 1.1553901433944702,
"learning_rate": 4.46374521072797e-05,
"loss": 12.9115,
"step": 11200
},
{
"epoch": 8.659003831417625,
"grad_norm": 2.876321792602539,
"learning_rate": 4.458955938697319e-05,
"loss": 12.8372,
"step": 11300
},
{
"epoch": 8.735632183908045,
"grad_norm": 2.3537096977233887,
"learning_rate": 4.454166666666667e-05,
"loss": 12.8684,
"step": 11400
},
{
"epoch": 8.812260536398467,
"grad_norm": 1.4264323711395264,
"learning_rate": 4.4493773946360154e-05,
"loss": 12.6151,
"step": 11500
},
{
"epoch": 8.88888888888889,
"grad_norm": 1.8997728824615479,
"learning_rate": 4.4446360153256706e-05,
"loss": 12.8187,
"step": 11600
},
{
"epoch": 8.96551724137931,
"grad_norm": 1.8338580131530762,
"learning_rate": 4.439846743295019e-05,
"loss": 12.7365,
"step": 11700
},
{
"epoch": 9.0,
"eval_loss": 13.53819465637207,
"eval_runtime": 44.0314,
"eval_samples_per_second": 29.638,
"eval_steps_per_second": 3.725,
"step": 11745
},
{
"epoch": 9.042145593869732,
"grad_norm": 12.737005233764648,
"learning_rate": 4.4351053639846745e-05,
"loss": 12.8002,
"step": 11800
},
{
"epoch": 9.118773946360154,
"grad_norm": 1.8820631504058838,
"learning_rate": 4.430316091954023e-05,
"loss": 12.8415,
"step": 11900
},
{
"epoch": 9.195402298850574,
"grad_norm": 1.5012093782424927,
"learning_rate": 4.425526819923372e-05,
"loss": 12.8011,
"step": 12000
},
{
"epoch": 9.272030651340996,
"grad_norm": 2.5062639713287354,
"learning_rate": 4.4207375478927205e-05,
"loss": 12.7156,
"step": 12100
},
{
"epoch": 9.348659003831418,
"grad_norm": 1.5295358896255493,
"learning_rate": 4.415948275862069e-05,
"loss": 12.8449,
"step": 12200
},
{
"epoch": 9.425287356321839,
"grad_norm": 1.6232823133468628,
"learning_rate": 4.411159003831418e-05,
"loss": 12.7345,
"step": 12300
},
{
"epoch": 9.50191570881226,
"grad_norm": 1.4783318042755127,
"learning_rate": 4.4063697318007666e-05,
"loss": 12.7392,
"step": 12400
},
{
"epoch": 9.578544061302683,
"grad_norm": 1.7494572401046753,
"learning_rate": 4.4015804597701146e-05,
"loss": 12.6017,
"step": 12500
},
{
"epoch": 9.655172413793103,
"grad_norm": 2.065991163253784,
"learning_rate": 4.396791187739464e-05,
"loss": 12.695,
"step": 12600
},
{
"epoch": 9.731800766283525,
"grad_norm": 1.2360838651657104,
"learning_rate": 4.3920019157088127e-05,
"loss": 12.7994,
"step": 12700
},
{
"epoch": 9.808429118773946,
"grad_norm": 2.084902048110962,
"learning_rate": 4.3872126436781613e-05,
"loss": 12.6864,
"step": 12800
},
{
"epoch": 9.885057471264368,
"grad_norm": 1.4381409883499146,
"learning_rate": 4.38242337164751e-05,
"loss": 12.6875,
"step": 12900
},
{
"epoch": 9.96168582375479,
"grad_norm": 1.5936471223831177,
"learning_rate": 4.377634099616859e-05,
"loss": 12.6413,
"step": 13000
},
{
"epoch": 10.0,
"eval_loss": 13.456477165222168,
"eval_runtime": 44.0741,
"eval_samples_per_second": 29.609,
"eval_steps_per_second": 3.721,
"step": 13050
},
{
"epoch": 10.03831417624521,
"grad_norm": 1.1829323768615723,
"learning_rate": 4.3728448275862074e-05,
"loss": 12.7182,
"step": 13100
},
{
"epoch": 10.114942528735632,
"grad_norm": 1.7679022550582886,
"learning_rate": 4.368055555555556e-05,
"loss": 12.7508,
"step": 13200
},
{
"epoch": 10.191570881226054,
"grad_norm": 2.4053192138671875,
"learning_rate": 4.363266283524904e-05,
"loss": 12.5668,
"step": 13300
},
{
"epoch": 10.268199233716475,
"grad_norm": 2.4858756065368652,
"learning_rate": 4.358477011494253e-05,
"loss": 12.6561,
"step": 13400
},
{
"epoch": 10.344827586206897,
"grad_norm": 2.138453483581543,
"learning_rate": 4.3536877394636015e-05,
"loss": 12.6829,
"step": 13500
},
{
"epoch": 10.421455938697317,
"grad_norm": 1.490075707435608,
"learning_rate": 4.34889846743295e-05,
"loss": 12.7284,
"step": 13600
},
{
"epoch": 10.49808429118774,
"grad_norm": 3.1338703632354736,
"learning_rate": 4.344109195402299e-05,
"loss": 12.5722,
"step": 13700
},
{
"epoch": 10.574712643678161,
"grad_norm": 1.844388723373413,
"learning_rate": 4.3393199233716475e-05,
"loss": 12.8212,
"step": 13800
},
{
"epoch": 10.651340996168582,
"grad_norm": 1.9379137754440308,
"learning_rate": 4.334530651340996e-05,
"loss": 12.368,
"step": 13900
},
{
"epoch": 10.727969348659004,
"grad_norm": 4.608842849731445,
"learning_rate": 4.3297413793103456e-05,
"loss": 12.3258,
"step": 14000
},
{
"epoch": 10.804597701149426,
"grad_norm": 1.607155680656433,
"learning_rate": 4.325e-05,
"loss": 12.8355,
"step": 14100
},
{
"epoch": 10.881226053639846,
"grad_norm": 1.7595943212509155,
"learning_rate": 4.320210727969349e-05,
"loss": 12.6135,
"step": 14200
},
{
"epoch": 10.957854406130268,
"grad_norm": 1.7879704236984253,
"learning_rate": 4.3154214559386975e-05,
"loss": 12.7107,
"step": 14300
},
{
"epoch": 11.0,
"eval_loss": 13.364398002624512,
"eval_runtime": 44.0273,
"eval_samples_per_second": 29.641,
"eval_steps_per_second": 3.725,
"step": 14355
},
{
"epoch": 11.03448275862069,
"grad_norm": 3.187349557876587,
"learning_rate": 4.310632183908046e-05,
"loss": 12.7471,
"step": 14400
},
{
"epoch": 11.11111111111111,
"grad_norm": 3.118311643600464,
"learning_rate": 4.305842911877395e-05,
"loss": 12.4422,
"step": 14500
},
{
"epoch": 11.187739463601533,
"grad_norm": 2.276580333709717,
"learning_rate": 4.3010536398467435e-05,
"loss": 12.5443,
"step": 14600
},
{
"epoch": 11.264367816091955,
"grad_norm": 1.3369340896606445,
"learning_rate": 4.296264367816092e-05,
"loss": 12.7497,
"step": 14700
},
{
"epoch": 11.340996168582375,
"grad_norm": 1.2438215017318726,
"learning_rate": 4.291475095785441e-05,
"loss": 12.6343,
"step": 14800
},
{
"epoch": 11.417624521072797,
"grad_norm": 1.668867826461792,
"learning_rate": 4.2866858237547896e-05,
"loss": 12.673,
"step": 14900
},
{
"epoch": 11.494252873563218,
"grad_norm": 2.550316572189331,
"learning_rate": 4.281896551724138e-05,
"loss": 12.7346,
"step": 15000
},
{
"epoch": 11.57088122605364,
"grad_norm": 1.3926326036453247,
"learning_rate": 4.277107279693487e-05,
"loss": 12.5431,
"step": 15100
},
{
"epoch": 11.647509578544062,
"grad_norm": 1.3561134338378906,
"learning_rate": 4.272318007662836e-05,
"loss": 12.4943,
"step": 15200
},
{
"epoch": 11.724137931034482,
"grad_norm": 1.4978444576263428,
"learning_rate": 4.2675287356321844e-05,
"loss": 12.4103,
"step": 15300
},
{
"epoch": 11.800766283524904,
"grad_norm": 1.8163210153579712,
"learning_rate": 4.262739463601533e-05,
"loss": 12.5454,
"step": 15400
},
{
"epoch": 11.877394636015326,
"grad_norm": 1.3819987773895264,
"learning_rate": 4.257950191570881e-05,
"loss": 12.5219,
"step": 15500
},
{
"epoch": 11.954022988505747,
"grad_norm": 1.6237196922302246,
"learning_rate": 4.25316091954023e-05,
"loss": 12.5876,
"step": 15600
},
{
"epoch": 12.0,
"eval_loss": 13.39963436126709,
"eval_runtime": 44.002,
"eval_samples_per_second": 29.658,
"eval_steps_per_second": 3.727,
"step": 15660
},
{
"epoch": 12.030651340996169,
"grad_norm": 1.1271090507507324,
"learning_rate": 4.2483716475095784e-05,
"loss": 12.3581,
"step": 15700
},
{
"epoch": 12.10727969348659,
"grad_norm": 1.5027310848236084,
"learning_rate": 4.243582375478927e-05,
"loss": 12.5517,
"step": 15800
},
{
"epoch": 12.183908045977011,
"grad_norm": 1.5543391704559326,
"learning_rate": 4.238793103448276e-05,
"loss": 12.7011,
"step": 15900
},
{
"epoch": 12.260536398467433,
"grad_norm": 1.7037404775619507,
"learning_rate": 4.2340038314176245e-05,
"loss": 12.289,
"step": 16000
},
{
"epoch": 12.337164750957854,
"grad_norm": 4.505245208740234,
"learning_rate": 4.229214559386974e-05,
"loss": 12.3584,
"step": 16100
},
{
"epoch": 12.413793103448276,
"grad_norm": 1.5144113302230835,
"learning_rate": 4.2244252873563225e-05,
"loss": 12.4209,
"step": 16200
},
{
"epoch": 12.490421455938698,
"grad_norm": 1.2396819591522217,
"learning_rate": 4.2196360153256706e-05,
"loss": 12.4463,
"step": 16300
},
{
"epoch": 12.567049808429118,
"grad_norm": 5.947683334350586,
"learning_rate": 4.214846743295019e-05,
"loss": 12.6401,
"step": 16400
},
{
"epoch": 12.64367816091954,
"grad_norm": 2.070812225341797,
"learning_rate": 4.210057471264368e-05,
"loss": 12.6885,
"step": 16500
},
{
"epoch": 12.720306513409962,
"grad_norm": 1.7540252208709717,
"learning_rate": 4.2052681992337166e-05,
"loss": 12.3138,
"step": 16600
},
{
"epoch": 12.796934865900383,
"grad_norm": 1.3372827768325806,
"learning_rate": 4.200478927203065e-05,
"loss": 12.8475,
"step": 16700
},
{
"epoch": 12.873563218390805,
"grad_norm": 1.6598443984985352,
"learning_rate": 4.195689655172414e-05,
"loss": 12.575,
"step": 16800
},
{
"epoch": 12.950191570881227,
"grad_norm": 1.5420461893081665,
"learning_rate": 4.190900383141763e-05,
"loss": 12.499,
"step": 16900
},
{
"epoch": 13.0,
"eval_loss": 13.359596252441406,
"eval_runtime": 43.9919,
"eval_samples_per_second": 29.665,
"eval_steps_per_second": 3.728,
"step": 16965
},
{
"epoch": 13.026819923371647,
"grad_norm": 1.785803198814392,
"learning_rate": 4.1861111111111114e-05,
"loss": 12.3123,
"step": 17000
},
{
"epoch": 13.10344827586207,
"grad_norm": 3.8619072437286377,
"learning_rate": 4.1813697318007665e-05,
"loss": 12.4633,
"step": 17100
},
{
"epoch": 13.18007662835249,
"grad_norm": 1.2189018726348877,
"learning_rate": 4.176580459770115e-05,
"loss": 12.4732,
"step": 17200
},
{
"epoch": 13.256704980842912,
"grad_norm": 3.579725742340088,
"learning_rate": 4.171791187739464e-05,
"loss": 12.3486,
"step": 17300
},
{
"epoch": 13.333333333333334,
"grad_norm": 1.258268117904663,
"learning_rate": 4.1670019157088126e-05,
"loss": 12.5506,
"step": 17400
},
{
"epoch": 13.409961685823754,
"grad_norm": 1.6867891550064087,
"learning_rate": 4.162212643678161e-05,
"loss": 12.5667,
"step": 17500
},
{
"epoch": 13.486590038314176,
"grad_norm": 1.5345897674560547,
"learning_rate": 4.15742337164751e-05,
"loss": 12.5206,
"step": 17600
},
{
"epoch": 13.563218390804598,
"grad_norm": 1.1699010133743286,
"learning_rate": 4.152634099616859e-05,
"loss": 12.3728,
"step": 17700
},
{
"epoch": 13.639846743295019,
"grad_norm": 1.669938325881958,
"learning_rate": 4.147844827586207e-05,
"loss": 12.4601,
"step": 17800
},
{
"epoch": 13.71647509578544,
"grad_norm": 1.2530852556228638,
"learning_rate": 4.1430555555555554e-05,
"loss": 12.4501,
"step": 17900
},
{
"epoch": 13.793103448275861,
"grad_norm": 1.790138840675354,
"learning_rate": 4.138266283524904e-05,
"loss": 12.467,
"step": 18000
},
{
"epoch": 13.869731800766283,
"grad_norm": 1.3373574018478394,
"learning_rate": 4.133477011494253e-05,
"loss": 12.4602,
"step": 18100
},
{
"epoch": 13.946360153256705,
"grad_norm": 1.837951898574829,
"learning_rate": 4.128687739463602e-05,
"loss": 12.4591,
"step": 18200
},
{
"epoch": 14.0,
"eval_loss": 13.289255142211914,
"eval_runtime": 43.9866,
"eval_samples_per_second": 29.668,
"eval_steps_per_second": 3.728,
"step": 18270
},
{
"epoch": 14.022988505747126,
"grad_norm": 1.540867805480957,
"learning_rate": 4.123898467432951e-05,
"loss": 12.59,
"step": 18300
},
{
"epoch": 14.099616858237548,
"grad_norm": 1.6285018920898438,
"learning_rate": 4.1191091954022995e-05,
"loss": 12.5162,
"step": 18400
},
{
"epoch": 14.17624521072797,
"grad_norm": 0.8983919620513916,
"learning_rate": 4.114319923371648e-05,
"loss": 12.4312,
"step": 18500
},
{
"epoch": 14.25287356321839,
"grad_norm": 1.7475948333740234,
"learning_rate": 4.109530651340996e-05,
"loss": 12.483,
"step": 18600
},
{
"epoch": 14.329501915708812,
"grad_norm": 1.723708987236023,
"learning_rate": 4.104741379310345e-05,
"loss": 12.5177,
"step": 18700
},
{
"epoch": 14.406130268199234,
"grad_norm": 1.3113809823989868,
"learning_rate": 4.0999521072796936e-05,
"loss": 12.3171,
"step": 18800
},
{
"epoch": 14.482758620689655,
"grad_norm": 1.7641185522079468,
"learning_rate": 4.095162835249042e-05,
"loss": 12.4669,
"step": 18900
},
{
"epoch": 14.559386973180077,
"grad_norm": 1.6181635856628418,
"learning_rate": 4.090373563218391e-05,
"loss": 12.3302,
"step": 19000
},
{
"epoch": 14.636015325670499,
"grad_norm": 1.2323795557022095,
"learning_rate": 4.0855842911877396e-05,
"loss": 12.4211,
"step": 19100
},
{
"epoch": 14.71264367816092,
"grad_norm": 1.7597166299819946,
"learning_rate": 4.080795019157088e-05,
"loss": 12.4985,
"step": 19200
},
{
"epoch": 14.789272030651341,
"grad_norm": 1.0281277894973755,
"learning_rate": 4.076005747126437e-05,
"loss": 12.5672,
"step": 19300
},
{
"epoch": 14.865900383141762,
"grad_norm": 3.3272478580474854,
"learning_rate": 4.071216475095786e-05,
"loss": 12.2671,
"step": 19400
},
{
"epoch": 14.942528735632184,
"grad_norm": 3.1264896392822266,
"learning_rate": 4.066427203065134e-05,
"loss": 12.4736,
"step": 19500
},
{
"epoch": 15.0,
"eval_loss": 13.205364227294922,
"eval_runtime": 43.9612,
"eval_samples_per_second": 29.685,
"eval_steps_per_second": 3.731,
"step": 19575
},
{
"epoch": 15.019157088122606,
"grad_norm": 1.568294882774353,
"learning_rate": 4.061637931034483e-05,
"loss": 12.4604,
"step": 19600
},
{
"epoch": 15.095785440613026,
"grad_norm": 1.919912576675415,
"learning_rate": 4.056848659003832e-05,
"loss": 12.3773,
"step": 19700
},
{
"epoch": 15.172413793103448,
"grad_norm": 1.5357537269592285,
"learning_rate": 4.0520593869731804e-05,
"loss": 12.3406,
"step": 19800
},
{
"epoch": 15.24904214559387,
"grad_norm": 1.7306512594223022,
"learning_rate": 4.0473180076628356e-05,
"loss": 12.4036,
"step": 19900
},
{
"epoch": 15.32567049808429,
"grad_norm": 1.6036773920059204,
"learning_rate": 4.0425287356321836e-05,
"loss": 12.3554,
"step": 20000
},
{
"epoch": 15.402298850574713,
"grad_norm": 1.211962342262268,
"learning_rate": 4.037739463601532e-05,
"loss": 12.5084,
"step": 20100
},
{
"epoch": 15.478927203065133,
"grad_norm": 1.4626506567001343,
"learning_rate": 4.032950191570881e-05,
"loss": 12.3593,
"step": 20200
},
{
"epoch": 15.555555555555555,
"grad_norm": 1.6557157039642334,
"learning_rate": 4.0281609195402304e-05,
"loss": 12.3249,
"step": 20300
},
{
"epoch": 15.632183908045977,
"grad_norm": 1.735300064086914,
"learning_rate": 4.023371647509579e-05,
"loss": 12.2958,
"step": 20400
},
{
"epoch": 15.708812260536398,
"grad_norm": 1.2972387075424194,
"learning_rate": 4.018582375478928e-05,
"loss": 12.4011,
"step": 20500
},
{
"epoch": 15.78544061302682,
"grad_norm": 1.2028956413269043,
"learning_rate": 4.0137931034482764e-05,
"loss": 12.3923,
"step": 20600
},
{
"epoch": 15.862068965517242,
"grad_norm": 1.9574451446533203,
"learning_rate": 4.009003831417625e-05,
"loss": 12.4927,
"step": 20700
},
{
"epoch": 15.938697318007662,
"grad_norm": 2.3753159046173096,
"learning_rate": 4.004214559386973e-05,
"loss": 12.4565,
"step": 20800
},
{
"epoch": 16.0,
"eval_loss": 13.146517753601074,
"eval_runtime": 43.956,
"eval_samples_per_second": 29.689,
"eval_steps_per_second": 3.731,
"step": 20880
},
{
"epoch": 16.015325670498083,
"grad_norm": 1.4980436563491821,
"learning_rate": 3.999425287356322e-05,
"loss": 12.4546,
"step": 20900
},
{
"epoch": 16.091954022988507,
"grad_norm": 1.2177377939224243,
"learning_rate": 3.9946360153256705e-05,
"loss": 12.3682,
"step": 21000
},
{
"epoch": 16.168582375478927,
"grad_norm": 1.9785245656967163,
"learning_rate": 3.989846743295019e-05,
"loss": 12.4315,
"step": 21100
},
{
"epoch": 16.245210727969347,
"grad_norm": 2.2773125171661377,
"learning_rate": 3.985057471264368e-05,
"loss": 12.4728,
"step": 21200
},
{
"epoch": 16.32183908045977,
"grad_norm": 1.1049697399139404,
"learning_rate": 3.9802681992337166e-05,
"loss": 12.0735,
"step": 21300
},
{
"epoch": 16.39846743295019,
"grad_norm": 2.937175750732422,
"learning_rate": 3.975478927203065e-05,
"loss": 12.4713,
"step": 21400
},
{
"epoch": 16.47509578544061,
"grad_norm": 1.058626651763916,
"learning_rate": 3.970689655172414e-05,
"loss": 12.3329,
"step": 21500
},
{
"epoch": 16.551724137931036,
"grad_norm": 2.357311248779297,
"learning_rate": 3.9659003831417626e-05,
"loss": 12.2249,
"step": 21600
},
{
"epoch": 16.628352490421456,
"grad_norm": 1.0534141063690186,
"learning_rate": 3.961111111111111e-05,
"loss": 12.4414,
"step": 21700
},
{
"epoch": 16.704980842911876,
"grad_norm": 1.5288047790527344,
"learning_rate": 3.95632183908046e-05,
"loss": 12.0682,
"step": 21800
},
{
"epoch": 16.7816091954023,
"grad_norm": 2.628070831298828,
"learning_rate": 3.951532567049809e-05,
"loss": 12.367,
"step": 21900
},
{
"epoch": 16.85823754789272,
"grad_norm": 1.4049383401870728,
"learning_rate": 3.9467432950191574e-05,
"loss": 12.1073,
"step": 22000
},
{
"epoch": 16.93486590038314,
"grad_norm": 1.8470909595489502,
"learning_rate": 3.941954022988506e-05,
"loss": 12.3757,
"step": 22100
},
{
"epoch": 17.0,
"eval_loss": 13.134416580200195,
"eval_runtime": 44.0763,
"eval_samples_per_second": 29.608,
"eval_steps_per_second": 3.721,
"step": 22185
},
{
"epoch": 17.011494252873565,
"grad_norm": 1.1388458013534546,
"learning_rate": 3.937164750957855e-05,
"loss": 12.6443,
"step": 22200
},
{
"epoch": 17.088122605363985,
"grad_norm": 1.202028512954712,
"learning_rate": 3.9323754789272034e-05,
"loss": 12.3013,
"step": 22300
},
{
"epoch": 17.164750957854405,
"grad_norm": 1.210375189781189,
"learning_rate": 3.927586206896552e-05,
"loss": 12.4812,
"step": 22400
},
{
"epoch": 17.24137931034483,
"grad_norm": 1.6550730466842651,
"learning_rate": 3.922796934865901e-05,
"loss": 12.3152,
"step": 22500
},
{
"epoch": 17.31800766283525,
"grad_norm": 1.5777093172073364,
"learning_rate": 3.918007662835249e-05,
"loss": 12.2296,
"step": 22600
},
{
"epoch": 17.39463601532567,
"grad_norm": 7.877992153167725,
"learning_rate": 3.9132183908045975e-05,
"loss": 12.4408,
"step": 22700
},
{
"epoch": 17.47126436781609,
"grad_norm": 1.6760473251342773,
"learning_rate": 3.908429118773946e-05,
"loss": 12.251,
"step": 22800
},
{
"epoch": 17.547892720306514,
"grad_norm": 2.4793410301208496,
"learning_rate": 3.903639846743295e-05,
"loss": 12.3864,
"step": 22900
},
{
"epoch": 17.624521072796934,
"grad_norm": 1.331120491027832,
"learning_rate": 3.8988505747126436e-05,
"loss": 12.0078,
"step": 23000
},
{
"epoch": 17.701149425287355,
"grad_norm": 1.1477069854736328,
"learning_rate": 3.894109195402299e-05,
"loss": 12.2234,
"step": 23100
},
{
"epoch": 17.77777777777778,
"grad_norm": 1.5665520429611206,
"learning_rate": 3.8893199233716474e-05,
"loss": 12.2716,
"step": 23200
},
{
"epoch": 17.8544061302682,
"grad_norm": 1.4720168113708496,
"learning_rate": 3.884530651340996e-05,
"loss": 12.2528,
"step": 23300
},
{
"epoch": 17.93103448275862,
"grad_norm": 1.4990317821502686,
"learning_rate": 3.879741379310345e-05,
"loss": 12.4111,
"step": 23400
},
{
"epoch": 18.0,
"eval_loss": 13.10958194732666,
"eval_runtime": 43.9884,
"eval_samples_per_second": 29.667,
"eval_steps_per_second": 3.728,
"step": 23490
},
{
"epoch": 18.007662835249043,
"grad_norm": 1.653239130973816,
"learning_rate": 3.8749521072796935e-05,
"loss": 12.4558,
"step": 23500
},
{
"epoch": 18.084291187739463,
"grad_norm": 1.3574182987213135,
"learning_rate": 3.870162835249042e-05,
"loss": 12.3242,
"step": 23600
},
{
"epoch": 18.160919540229884,
"grad_norm": 2.0138070583343506,
"learning_rate": 3.865373563218391e-05,
"loss": 12.2255,
"step": 23700
},
{
"epoch": 18.237547892720308,
"grad_norm": 1.6546958684921265,
"learning_rate": 3.8605842911877396e-05,
"loss": 12.3826,
"step": 23800
},
{
"epoch": 18.314176245210728,
"grad_norm": 1.304247498512268,
"learning_rate": 3.855795019157088e-05,
"loss": 12.1766,
"step": 23900
},
{
"epoch": 18.39080459770115,
"grad_norm": 1.109941005706787,
"learning_rate": 3.851005747126437e-05,
"loss": 12.3784,
"step": 24000
},
{
"epoch": 18.467432950191572,
"grad_norm": 4.5435872077941895,
"learning_rate": 3.8462164750957856e-05,
"loss": 12.2292,
"step": 24100
},
{
"epoch": 18.544061302681992,
"grad_norm": 2.141022205352783,
"learning_rate": 3.841427203065134e-05,
"loss": 12.2826,
"step": 24200
},
{
"epoch": 18.620689655172413,
"grad_norm": 1.6946494579315186,
"learning_rate": 3.836637931034483e-05,
"loss": 12.3012,
"step": 24300
},
{
"epoch": 18.697318007662837,
"grad_norm": 1.3159388303756714,
"learning_rate": 3.831848659003832e-05,
"loss": 12.1835,
"step": 24400
},
{
"epoch": 18.773946360153257,
"grad_norm": 2.499986410140991,
"learning_rate": 3.8270593869731804e-05,
"loss": 12.4302,
"step": 24500
},
{
"epoch": 18.850574712643677,
"grad_norm": 1.7443987131118774,
"learning_rate": 3.822270114942529e-05,
"loss": 12.5402,
"step": 24600
},
{
"epoch": 18.9272030651341,
"grad_norm": 1.4758720397949219,
"learning_rate": 3.817480842911878e-05,
"loss": 12.3978,
"step": 24700
},
{
"epoch": 19.0,
"eval_loss": 13.101744651794434,
"eval_runtime": 43.9919,
"eval_samples_per_second": 29.665,
"eval_steps_per_second": 3.728,
"step": 24795
},
{
"epoch": 19.00383141762452,
"grad_norm": 1.774843454360962,
"learning_rate": 3.812691570881226e-05,
"loss": 12.2954,
"step": 24800
},
{
"epoch": 19.080459770114942,
"grad_norm": 1.693176031112671,
"learning_rate": 3.8079022988505745e-05,
"loss": 12.3156,
"step": 24900
},
{
"epoch": 19.157088122605366,
"grad_norm": 1.3531700372695923,
"learning_rate": 3.803113026819923e-05,
"loss": 12.3989,
"step": 25000
},
{
"epoch": 19.233716475095786,
"grad_norm": 2.083587884902954,
"learning_rate": 3.798323754789272e-05,
"loss": 12.3523,
"step": 25100
},
{
"epoch": 19.310344827586206,
"grad_norm": 2.1645917892456055,
"learning_rate": 3.793534482758621e-05,
"loss": 12.0512,
"step": 25200
},
{
"epoch": 19.386973180076627,
"grad_norm": 1.8869907855987549,
"learning_rate": 3.78874521072797e-05,
"loss": 12.4837,
"step": 25300
},
{
"epoch": 19.46360153256705,
"grad_norm": 1.2421497106552124,
"learning_rate": 3.7840038314176244e-05,
"loss": 11.9937,
"step": 25400
},
{
"epoch": 19.54022988505747,
"grad_norm": 1.5155110359191895,
"learning_rate": 3.779214559386973e-05,
"loss": 12.2264,
"step": 25500
},
{
"epoch": 19.61685823754789,
"grad_norm": 1.1511332988739014,
"learning_rate": 3.774425287356322e-05,
"loss": 12.2063,
"step": 25600
},
{
"epoch": 19.693486590038315,
"grad_norm": 1.8984183073043823,
"learning_rate": 3.7696360153256705e-05,
"loss": 12.3237,
"step": 25700
},
{
"epoch": 19.770114942528735,
"grad_norm": 0.9674005508422852,
"learning_rate": 3.764846743295019e-05,
"loss": 12.1877,
"step": 25800
},
{
"epoch": 19.846743295019156,
"grad_norm": 2.0560641288757324,
"learning_rate": 3.7600574712643685e-05,
"loss": 12.2343,
"step": 25900
},
{
"epoch": 19.92337164750958,
"grad_norm": 1.3923600912094116,
"learning_rate": 3.755268199233717e-05,
"loss": 12.2683,
"step": 26000
},
{
"epoch": 20.0,
"grad_norm": 2.9314024448394775,
"learning_rate": 3.750478927203065e-05,
"loss": 12.3074,
"step": 26100
},
{
"epoch": 20.0,
"eval_loss": 13.07620906829834,
"eval_runtime": 43.9934,
"eval_samples_per_second": 29.664,
"eval_steps_per_second": 3.728,
"step": 26100
},
{
"epoch": 20.07662835249042,
"grad_norm": 1.5305142402648926,
"learning_rate": 3.745689655172414e-05,
"loss": 12.2615,
"step": 26200
},
{
"epoch": 20.153256704980844,
"grad_norm": 1.3846060037612915,
"learning_rate": 3.7409003831417626e-05,
"loss": 12.3109,
"step": 26300
},
{
"epoch": 20.229885057471265,
"grad_norm": 3.0465173721313477,
"learning_rate": 3.736111111111111e-05,
"loss": 12.258,
"step": 26400
},
{
"epoch": 20.306513409961685,
"grad_norm": 3.9723782539367676,
"learning_rate": 3.73132183908046e-05,
"loss": 12.2494,
"step": 26500
},
{
"epoch": 20.38314176245211,
"grad_norm": 1.464296817779541,
"learning_rate": 3.7265325670498086e-05,
"loss": 12.2231,
"step": 26600
},
{
"epoch": 20.45977011494253,
"grad_norm": 1.6789374351501465,
"learning_rate": 3.721743295019157e-05,
"loss": 12.3391,
"step": 26700
},
{
"epoch": 20.53639846743295,
"grad_norm": 1.1731619834899902,
"learning_rate": 3.716954022988506e-05,
"loss": 12.2172,
"step": 26800
},
{
"epoch": 20.613026819923373,
"grad_norm": 2.8839802742004395,
"learning_rate": 3.712164750957855e-05,
"loss": 12.251,
"step": 26900
},
{
"epoch": 20.689655172413794,
"grad_norm": 1.3104863166809082,
"learning_rate": 3.707375478927203e-05,
"loss": 12.4269,
"step": 27000
},
{
"epoch": 20.766283524904214,
"grad_norm": 2.5182230472564697,
"learning_rate": 3.7025862068965514e-05,
"loss": 12.1972,
"step": 27100
},
{
"epoch": 20.842911877394634,
"grad_norm": 1.4510316848754883,
"learning_rate": 3.6977969348659e-05,
"loss": 12.1446,
"step": 27200
},
{
"epoch": 20.919540229885058,
"grad_norm": 1.7377287149429321,
"learning_rate": 3.6930076628352495e-05,
"loss": 12.2374,
"step": 27300
},
{
"epoch": 20.99616858237548,
"grad_norm": 1.308686375617981,
"learning_rate": 3.6882662835249046e-05,
"loss": 12.2169,
"step": 27400
},
{
"epoch": 21.0,
"eval_loss": 13.027502059936523,
"eval_runtime": 44.025,
"eval_samples_per_second": 29.642,
"eval_steps_per_second": 3.725,
"step": 27405
},
{
"epoch": 21.0727969348659,
"grad_norm": 1.7697923183441162,
"learning_rate": 3.6834770114942526e-05,
"loss": 12.3711,
"step": 27500
},
{
"epoch": 21.149425287356323,
"grad_norm": 1.2963312864303589,
"learning_rate": 3.678687739463601e-05,
"loss": 12.1974,
"step": 27600
},
{
"epoch": 21.226053639846743,
"grad_norm": 1.617470383644104,
"learning_rate": 3.67389846743295e-05,
"loss": 12.1879,
"step": 27700
},
{
"epoch": 21.302681992337163,
"grad_norm": 2.007051944732666,
"learning_rate": 3.669109195402299e-05,
"loss": 12.2758,
"step": 27800
},
{
"epoch": 21.379310344827587,
"grad_norm": 1.4421669244766235,
"learning_rate": 3.6643199233716474e-05,
"loss": 12.1852,
"step": 27900
},
{
"epoch": 21.455938697318008,
"grad_norm": 2.678457260131836,
"learning_rate": 3.659530651340997e-05,
"loss": 12.3418,
"step": 28000
},
{
"epoch": 21.532567049808428,
"grad_norm": 1.4007712602615356,
"learning_rate": 3.6547413793103455e-05,
"loss": 12.4764,
"step": 28100
},
{
"epoch": 21.60919540229885,
"grad_norm": 4.606558322906494,
"learning_rate": 3.649952107279694e-05,
"loss": 12.2566,
"step": 28200
},
{
"epoch": 21.685823754789272,
"grad_norm": 1.354705810546875,
"learning_rate": 3.645162835249042e-05,
"loss": 12.2371,
"step": 28300
},
{
"epoch": 21.762452107279692,
"grad_norm": 1.7736151218414307,
"learning_rate": 3.640373563218391e-05,
"loss": 12.4794,
"step": 28400
},
{
"epoch": 21.839080459770116,
"grad_norm": 1.2875999212265015,
"learning_rate": 3.6355842911877395e-05,
"loss": 12.0016,
"step": 28500
},
{
"epoch": 21.915708812260537,
"grad_norm": 1.932035207748413,
"learning_rate": 3.630795019157088e-05,
"loss": 12.3018,
"step": 28600
},
{
"epoch": 21.992337164750957,
"grad_norm": 3.066443920135498,
"learning_rate": 3.626005747126437e-05,
"loss": 12.0117,
"step": 28700
},
{
"epoch": 22.0,
"eval_loss": 13.03292179107666,
"eval_runtime": 44.0005,
"eval_samples_per_second": 29.659,
"eval_steps_per_second": 3.727,
"step": 28710
},
{
"epoch": 22.06896551724138,
"grad_norm": 0.97423255443573,
"learning_rate": 3.6212164750957856e-05,
"loss": 12.4442,
"step": 28800
},
{
"epoch": 22.1455938697318,
"grad_norm": 1.7552623748779297,
"learning_rate": 3.616427203065134e-05,
"loss": 12.2976,
"step": 28900
},
{
"epoch": 22.22222222222222,
"grad_norm": 1.5857703685760498,
"learning_rate": 3.611637931034483e-05,
"loss": 12.1968,
"step": 29000
},
{
"epoch": 22.298850574712645,
"grad_norm": 1.381238341331482,
"learning_rate": 3.6068486590038317e-05,
"loss": 12.0455,
"step": 29100
},
{
"epoch": 22.375478927203066,
"grad_norm": 1.3380298614501953,
"learning_rate": 3.6020593869731803e-05,
"loss": 12.1833,
"step": 29200
},
{
"epoch": 22.452107279693486,
"grad_norm": 2.3591909408569336,
"learning_rate": 3.5972701149425284e-05,
"loss": 12.1562,
"step": 29300
},
{
"epoch": 22.52873563218391,
"grad_norm": 2.544651508331299,
"learning_rate": 3.592528735632184e-05,
"loss": 12.1318,
"step": 29400
},
{
"epoch": 22.60536398467433,
"grad_norm": 1.204476237297058,
"learning_rate": 3.587739463601533e-05,
"loss": 12.3856,
"step": 29500
},
{
"epoch": 22.68199233716475,
"grad_norm": 1.453444004058838,
"learning_rate": 3.5829501915708816e-05,
"loss": 12.0971,
"step": 29600
},
{
"epoch": 22.75862068965517,
"grad_norm": 2.287437915802002,
"learning_rate": 3.5781609195402296e-05,
"loss": 12.1294,
"step": 29700
},
{
"epoch": 22.835249042145595,
"grad_norm": 2.790942907333374,
"learning_rate": 3.573371647509578e-05,
"loss": 12.1613,
"step": 29800
},
{
"epoch": 22.911877394636015,
"grad_norm": 1.6170670986175537,
"learning_rate": 3.568582375478927e-05,
"loss": 12.0175,
"step": 29900
},
{
"epoch": 22.988505747126435,
"grad_norm": 1.724195122718811,
"learning_rate": 3.5637931034482757e-05,
"loss": 12.1815,
"step": 30000
},
{
"epoch": 23.0,
"eval_loss": 12.992958068847656,
"eval_runtime": 44.0141,
"eval_samples_per_second": 29.65,
"eval_steps_per_second": 3.726,
"step": 30015
},
{
"epoch": 23.06513409961686,
"grad_norm": 3.8932502269744873,
"learning_rate": 3.559003831417625e-05,
"loss": 12.1987,
"step": 30100
},
{
"epoch": 23.14176245210728,
"grad_norm": 1.8813198804855347,
"learning_rate": 3.554214559386974e-05,
"loss": 12.2208,
"step": 30200
},
{
"epoch": 23.2183908045977,
"grad_norm": 1.0299080610275269,
"learning_rate": 3.5494252873563224e-05,
"loss": 12.1662,
"step": 30300
},
{
"epoch": 23.295019157088124,
"grad_norm": 2.68420672416687,
"learning_rate": 3.544636015325671e-05,
"loss": 12.1013,
"step": 30400
},
{
"epoch": 23.371647509578544,
"grad_norm": 0.9587434530258179,
"learning_rate": 3.539846743295019e-05,
"loss": 12.3426,
"step": 30500
},
{
"epoch": 23.448275862068964,
"grad_norm": 1.8168953657150269,
"learning_rate": 3.535057471264368e-05,
"loss": 12.2303,
"step": 30600
},
{
"epoch": 23.52490421455939,
"grad_norm": 1.2712435722351074,
"learning_rate": 3.5302681992337165e-05,
"loss": 12.275,
"step": 30700
},
{
"epoch": 23.60153256704981,
"grad_norm": 1.0442867279052734,
"learning_rate": 3.525478927203065e-05,
"loss": 12.1344,
"step": 30800
},
{
"epoch": 23.67816091954023,
"grad_norm": 2.2171154022216797,
"learning_rate": 3.520689655172414e-05,
"loss": 12.1554,
"step": 30900
},
{
"epoch": 23.754789272030653,
"grad_norm": 1.5863583087921143,
"learning_rate": 3.5159003831417625e-05,
"loss": 12.1003,
"step": 31000
},
{
"epoch": 23.831417624521073,
"grad_norm": 1.4239143133163452,
"learning_rate": 3.511111111111111e-05,
"loss": 12.1271,
"step": 31100
},
{
"epoch": 23.908045977011493,
"grad_norm": 2.044018030166626,
"learning_rate": 3.50632183908046e-05,
"loss": 12.3269,
"step": 31200
},
{
"epoch": 23.984674329501917,
"grad_norm": 2.9049460887908936,
"learning_rate": 3.5015325670498086e-05,
"loss": 12.0403,
"step": 31300
},
{
"epoch": 24.0,
"eval_loss": 13.009976387023926,
"eval_runtime": 44.0062,
"eval_samples_per_second": 29.655,
"eval_steps_per_second": 3.727,
"step": 31320
},
{
"epoch": 24.061302681992338,
"grad_norm": 1.4207292795181274,
"learning_rate": 3.496743295019157e-05,
"loss": 12.0634,
"step": 31400
},
{
"epoch": 24.137931034482758,
"grad_norm": 1.886399269104004,
"learning_rate": 3.491954022988506e-05,
"loss": 12.1573,
"step": 31500
},
{
"epoch": 24.21455938697318,
"grad_norm": 2.239217519760132,
"learning_rate": 3.487164750957855e-05,
"loss": 12.3025,
"step": 31600
},
{
"epoch": 24.291187739463602,
"grad_norm": 1.495377540588379,
"learning_rate": 3.4823754789272034e-05,
"loss": 12.1236,
"step": 31700
},
{
"epoch": 24.367816091954023,
"grad_norm": 1.4570187330245972,
"learning_rate": 3.477586206896552e-05,
"loss": 12.1341,
"step": 31800
},
{
"epoch": 24.444444444444443,
"grad_norm": 1.137839674949646,
"learning_rate": 3.472796934865901e-05,
"loss": 12.1097,
"step": 31900
},
{
"epoch": 24.521072796934867,
"grad_norm": 1.9981390237808228,
"learning_rate": 3.4680076628352494e-05,
"loss": 12.4374,
"step": 32000
},
{
"epoch": 24.597701149425287,
"grad_norm": 1.6802810430526733,
"learning_rate": 3.463218390804598e-05,
"loss": 12.0851,
"step": 32100
},
{
"epoch": 24.674329501915707,
"grad_norm": 2.0081875324249268,
"learning_rate": 3.458429118773947e-05,
"loss": 12.0883,
"step": 32200
},
{
"epoch": 24.75095785440613,
"grad_norm": 2.637779474258423,
"learning_rate": 3.453639846743295e-05,
"loss": 12.198,
"step": 32300
},
{
"epoch": 24.82758620689655,
"grad_norm": 6.473161220550537,
"learning_rate": 3.4488505747126435e-05,
"loss": 12.1459,
"step": 32400
},
{
"epoch": 24.904214559386972,
"grad_norm": 1.3531584739685059,
"learning_rate": 3.444061302681992e-05,
"loss": 12.0297,
"step": 32500
},
{
"epoch": 24.980842911877396,
"grad_norm": 1.2492320537567139,
"learning_rate": 3.439272030651341e-05,
"loss": 12.0907,
"step": 32600
},
{
"epoch": 25.0,
"eval_loss": 12.98237419128418,
"eval_runtime": 44.0055,
"eval_samples_per_second": 29.655,
"eval_steps_per_second": 3.727,
"step": 32625
},
{
"epoch": 25.057471264367816,
"grad_norm": 1.2564047574996948,
"learning_rate": 3.4344827586206896e-05,
"loss": 12.3271,
"step": 32700
},
{
"epoch": 25.134099616858236,
"grad_norm": 1.6601101160049438,
"learning_rate": 3.429741379310345e-05,
"loss": 12.2568,
"step": 32800
},
{
"epoch": 25.21072796934866,
"grad_norm": 1.8177669048309326,
"learning_rate": 3.4249521072796934e-05,
"loss": 12.2059,
"step": 32900
},
{
"epoch": 25.28735632183908,
"grad_norm": 1.5476176738739014,
"learning_rate": 3.420162835249042e-05,
"loss": 12.2871,
"step": 33000
},
{
"epoch": 25.3639846743295,
"grad_norm": 1.305198073387146,
"learning_rate": 3.415373563218391e-05,
"loss": 12.258,
"step": 33100
},
{
"epoch": 25.440613026819925,
"grad_norm": 5.837198257446289,
"learning_rate": 3.4105842911877395e-05,
"loss": 12.0855,
"step": 33200
},
{
"epoch": 25.517241379310345,
"grad_norm": 2.148789882659912,
"learning_rate": 3.405795019157088e-05,
"loss": 12.1539,
"step": 33300
},
{
"epoch": 25.593869731800766,
"grad_norm": 1.8985601663589478,
"learning_rate": 3.401005747126437e-05,
"loss": 12.2977,
"step": 33400
},
{
"epoch": 25.67049808429119,
"grad_norm": 1.9121934175491333,
"learning_rate": 3.3962164750957855e-05,
"loss": 12.0616,
"step": 33500
},
{
"epoch": 25.74712643678161,
"grad_norm": 1.3972700834274292,
"learning_rate": 3.391427203065134e-05,
"loss": 12.0951,
"step": 33600
},
{
"epoch": 25.82375478927203,
"grad_norm": 1.3285768032073975,
"learning_rate": 3.386637931034483e-05,
"loss": 12.0531,
"step": 33700
},
{
"epoch": 25.900383141762454,
"grad_norm": 2.199030876159668,
"learning_rate": 3.3818486590038316e-05,
"loss": 11.9635,
"step": 33800
},
{
"epoch": 25.977011494252874,
"grad_norm": 1.0486905574798584,
"learning_rate": 3.37705938697318e-05,
"loss": 11.9477,
"step": 33900
},
{
"epoch": 26.0,
"eval_loss": 12.954750061035156,
"eval_runtime": 44.0151,
"eval_samples_per_second": 29.649,
"eval_steps_per_second": 3.726,
"step": 33930
},
{
"epoch": 26.053639846743295,
"grad_norm": 1.8525198698043823,
"learning_rate": 3.372270114942529e-05,
"loss": 11.9857,
"step": 34000
},
{
"epoch": 26.130268199233715,
"grad_norm": 1.4454785585403442,
"learning_rate": 3.367480842911878e-05,
"loss": 11.8142,
"step": 34100
},
{
"epoch": 26.20689655172414,
"grad_norm": 1.6828280687332153,
"learning_rate": 3.3626915708812264e-05,
"loss": 11.9359,
"step": 34200
},
{
"epoch": 26.28352490421456,
"grad_norm": 1.898542046546936,
"learning_rate": 3.357902298850575e-05,
"loss": 12.3808,
"step": 34300
},
{
"epoch": 26.36015325670498,
"grad_norm": 1.3259601593017578,
"learning_rate": 3.353113026819924e-05,
"loss": 11.9188,
"step": 34400
},
{
"epoch": 26.436781609195403,
"grad_norm": 1.2543106079101562,
"learning_rate": 3.348323754789272e-05,
"loss": 12.2622,
"step": 34500
},
{
"epoch": 26.513409961685824,
"grad_norm": 1.1741349697113037,
"learning_rate": 3.3435344827586204e-05,
"loss": 12.3296,
"step": 34600
},
{
"epoch": 26.590038314176244,
"grad_norm": 2.937052011489868,
"learning_rate": 3.338745210727969e-05,
"loss": 12.0383,
"step": 34700
},
{
"epoch": 26.666666666666668,
"grad_norm": 1.5736559629440308,
"learning_rate": 3.333955938697318e-05,
"loss": 12.178,
"step": 34800
},
{
"epoch": 26.743295019157088,
"grad_norm": 1.9110735654830933,
"learning_rate": 3.329214559386974e-05,
"loss": 12.223,
"step": 34900
},
{
"epoch": 26.81992337164751,
"grad_norm": 0.9110540747642517,
"learning_rate": 3.324425287356322e-05,
"loss": 12.1191,
"step": 35000
},
{
"epoch": 26.896551724137932,
"grad_norm": 1.3772426843643188,
"learning_rate": 3.3196360153256704e-05,
"loss": 12.1527,
"step": 35100
},
{
"epoch": 26.973180076628353,
"grad_norm": 1.5747685432434082,
"learning_rate": 3.314846743295019e-05,
"loss": 12.093,
"step": 35200
},
{
"epoch": 27.0,
"eval_loss": 12.915553092956543,
"eval_runtime": 44.0197,
"eval_samples_per_second": 29.646,
"eval_steps_per_second": 3.726,
"step": 35235
},
{
"epoch": 27.049808429118773,
"grad_norm": 1.285940408706665,
"learning_rate": 3.310057471264368e-05,
"loss": 12.1302,
"step": 35300
},
{
"epoch": 27.126436781609197,
"grad_norm": 1.3924872875213623,
"learning_rate": 3.3052681992337164e-05,
"loss": 12.2251,
"step": 35400
},
{
"epoch": 27.203065134099617,
"grad_norm": 3.2285568714141846,
"learning_rate": 3.300478927203065e-05,
"loss": 12.1551,
"step": 35500
},
{
"epoch": 27.279693486590038,
"grad_norm": 1.9970892667770386,
"learning_rate": 3.295689655172414e-05,
"loss": 12.1276,
"step": 35600
},
{
"epoch": 27.35632183908046,
"grad_norm": 1.5273020267486572,
"learning_rate": 3.290900383141763e-05,
"loss": 12.3051,
"step": 35700
},
{
"epoch": 27.43295019157088,
"grad_norm": 1.3356541395187378,
"learning_rate": 3.286111111111111e-05,
"loss": 12.1591,
"step": 35800
},
{
"epoch": 27.509578544061302,
"grad_norm": 1.1603785753250122,
"learning_rate": 3.28132183908046e-05,
"loss": 11.9451,
"step": 35900
},
{
"epoch": 27.586206896551722,
"grad_norm": 1.2263092994689941,
"learning_rate": 3.2765325670498086e-05,
"loss": 12.069,
"step": 36000
},
{
"epoch": 27.662835249042146,
"grad_norm": 2.639704465866089,
"learning_rate": 3.271743295019157e-05,
"loss": 12.0213,
"step": 36100
},
{
"epoch": 27.739463601532567,
"grad_norm": 1.1907585859298706,
"learning_rate": 3.266954022988506e-05,
"loss": 12.0336,
"step": 36200
},
{
"epoch": 27.816091954022987,
"grad_norm": 2.5226128101348877,
"learning_rate": 3.2621647509578546e-05,
"loss": 12.1515,
"step": 36300
},
{
"epoch": 27.89272030651341,
"grad_norm": 1.263527274131775,
"learning_rate": 3.257375478927203e-05,
"loss": 12.067,
"step": 36400
},
{
"epoch": 27.96934865900383,
"grad_norm": 1.636793613433838,
"learning_rate": 3.252586206896552e-05,
"loss": 12.14,
"step": 36500
},
{
"epoch": 28.0,
"eval_loss": 12.91286563873291,
"eval_runtime": 44.033,
"eval_samples_per_second": 29.637,
"eval_steps_per_second": 3.724,
"step": 36540
},
{
"epoch": 28.04597701149425,
"grad_norm": 1.691573977470398,
"learning_rate": 3.247796934865901e-05,
"loss": 12.0472,
"step": 36600
},
{
"epoch": 28.122605363984675,
"grad_norm": 2.2020788192749023,
"learning_rate": 3.2430076628352494e-05,
"loss": 12.0171,
"step": 36700
},
{
"epoch": 28.199233716475096,
"grad_norm": 1.9675192832946777,
"learning_rate": 3.2382183908045974e-05,
"loss": 12.1335,
"step": 36800
},
{
"epoch": 28.275862068965516,
"grad_norm": 2.210883378982544,
"learning_rate": 3.233429118773946e-05,
"loss": 12.065,
"step": 36900
},
{
"epoch": 28.35249042145594,
"grad_norm": 1.4574834108352661,
"learning_rate": 3.2286398467432954e-05,
"loss": 12.0635,
"step": 37000
},
{
"epoch": 28.42911877394636,
"grad_norm": 2.1000685691833496,
"learning_rate": 3.223850574712644e-05,
"loss": 12.2908,
"step": 37100
},
{
"epoch": 28.50574712643678,
"grad_norm": 2.088956832885742,
"learning_rate": 3.2191091954022986e-05,
"loss": 12.2421,
"step": 37200
},
{
"epoch": 28.582375478927204,
"grad_norm": 1.5785751342773438,
"learning_rate": 3.214319923371647e-05,
"loss": 12.0568,
"step": 37300
},
{
"epoch": 28.659003831417625,
"grad_norm": 1.5230878591537476,
"learning_rate": 3.209530651340996e-05,
"loss": 12.0995,
"step": 37400
},
{
"epoch": 28.735632183908045,
"grad_norm": 1.1175010204315186,
"learning_rate": 3.204741379310345e-05,
"loss": 12.17,
"step": 37500
},
{
"epoch": 28.81226053639847,
"grad_norm": 1.6524131298065186,
"learning_rate": 3.1999521072796934e-05,
"loss": 12.1192,
"step": 37600
},
{
"epoch": 28.88888888888889,
"grad_norm": 1.5143946409225464,
"learning_rate": 3.195162835249042e-05,
"loss": 11.9995,
"step": 37700
},
{
"epoch": 28.96551724137931,
"grad_norm": 1.2787953615188599,
"learning_rate": 3.1903735632183914e-05,
"loss": 12.0876,
"step": 37800
},
{
"epoch": 29.0,
"eval_loss": 12.9454984664917,
"eval_runtime": 44.0594,
"eval_samples_per_second": 29.619,
"eval_steps_per_second": 3.722,
"step": 37845
},
{
"epoch": 29.042145593869733,
"grad_norm": 1.4434622526168823,
"learning_rate": 3.18558429118774e-05,
"loss": 11.8509,
"step": 37900
},
{
"epoch": 29.118773946360154,
"grad_norm": 1.2989375591278076,
"learning_rate": 3.180795019157088e-05,
"loss": 12.1473,
"step": 38000
},
{
"epoch": 29.195402298850574,
"grad_norm": 1.6747602224349976,
"learning_rate": 3.176005747126437e-05,
"loss": 12.1781,
"step": 38100
},
{
"epoch": 29.272030651340994,
"grad_norm": 2.2328062057495117,
"learning_rate": 3.1712164750957855e-05,
"loss": 12.2881,
"step": 38200
},
{
"epoch": 29.34865900383142,
"grad_norm": 2.3226537704467773,
"learning_rate": 3.166427203065134e-05,
"loss": 12.0132,
"step": 38300
},
{
"epoch": 29.42528735632184,
"grad_norm": 1.7786709070205688,
"learning_rate": 3.161637931034483e-05,
"loss": 12.2086,
"step": 38400
},
{
"epoch": 29.50191570881226,
"grad_norm": 2.359247922897339,
"learning_rate": 3.1568486590038316e-05,
"loss": 12.3037,
"step": 38500
},
{
"epoch": 29.578544061302683,
"grad_norm": 1.661720633506775,
"learning_rate": 3.15205938697318e-05,
"loss": 11.9945,
"step": 38600
},
{
"epoch": 29.655172413793103,
"grad_norm": 1.2464226484298706,
"learning_rate": 3.147270114942529e-05,
"loss": 12.0475,
"step": 38700
},
{
"epoch": 29.731800766283524,
"grad_norm": 5.234483242034912,
"learning_rate": 3.1424808429118776e-05,
"loss": 12.1442,
"step": 38800
},
{
"epoch": 29.808429118773947,
"grad_norm": 1.2800259590148926,
"learning_rate": 3.137691570881226e-05,
"loss": 11.923,
"step": 38900
},
{
"epoch": 29.885057471264368,
"grad_norm": 1.3353965282440186,
"learning_rate": 3.132902298850574e-05,
"loss": 12.0991,
"step": 39000
},
{
"epoch": 29.961685823754788,
"grad_norm": 1.974084734916687,
"learning_rate": 3.128113026819924e-05,
"loss": 12.0987,
"step": 39100
},
{
"epoch": 30.0,
"eval_loss": 12.926346778869629,
"eval_runtime": 44.1327,
"eval_samples_per_second": 29.57,
"eval_steps_per_second": 3.716,
"step": 39150
},
{
"epoch": 30.038314176245212,
"grad_norm": 2.184515953063965,
"learning_rate": 3.1233237547892724e-05,
"loss": 11.9969,
"step": 39200
},
{
"epoch": 30.114942528735632,
"grad_norm": 3.448138952255249,
"learning_rate": 3.1185823754789276e-05,
"loss": 12.2465,
"step": 39300
},
{
"epoch": 30.191570881226053,
"grad_norm": 1.5382182598114014,
"learning_rate": 3.113793103448276e-05,
"loss": 12.1218,
"step": 39400
},
{
"epoch": 30.268199233716476,
"grad_norm": 1.4232020378112793,
"learning_rate": 3.109003831417624e-05,
"loss": 12.0744,
"step": 39500
},
{
"epoch": 30.344827586206897,
"grad_norm": 1.130115270614624,
"learning_rate": 3.104214559386973e-05,
"loss": 11.982,
"step": 39600
},
{
"epoch": 30.421455938697317,
"grad_norm": 0.9410238265991211,
"learning_rate": 3.0994252873563216e-05,
"loss": 11.9721,
"step": 39700
},
{
"epoch": 30.49808429118774,
"grad_norm": 1.6789051294326782,
"learning_rate": 3.09463601532567e-05,
"loss": 12.2021,
"step": 39800
},
{
"epoch": 30.57471264367816,
"grad_norm": 1.7361513376235962,
"learning_rate": 3.08984674329502e-05,
"loss": 12.1236,
"step": 39900
},
{
"epoch": 30.65134099616858,
"grad_norm": 1.868490219116211,
"learning_rate": 3.0850574712643684e-05,
"loss": 12.0632,
"step": 40000
},
{
"epoch": 30.727969348659006,
"grad_norm": 1.3586502075195312,
"learning_rate": 3.080268199233717e-05,
"loss": 12.0715,
"step": 40100
},
{
"epoch": 30.804597701149426,
"grad_norm": 1.6496648788452148,
"learning_rate": 3.075478927203066e-05,
"loss": 12.0989,
"step": 40200
},
{
"epoch": 30.881226053639846,
"grad_norm": 1.8671578168869019,
"learning_rate": 3.070689655172414e-05,
"loss": 11.996,
"step": 40300
},
{
"epoch": 30.957854406130267,
"grad_norm": 0.9875293374061584,
"learning_rate": 3.0659003831417624e-05,
"loss": 12.0908,
"step": 40400
},
{
"epoch": 31.0,
"eval_loss": 12.88086986541748,
"eval_runtime": 44.1375,
"eval_samples_per_second": 29.567,
"eval_steps_per_second": 3.716,
"step": 40455
},
{
"epoch": 31.03448275862069,
"grad_norm": 4.194854259490967,
"learning_rate": 3.061111111111111e-05,
"loss": 12.0422,
"step": 40500
},
{
"epoch": 31.11111111111111,
"grad_norm": 1.550528883934021,
"learning_rate": 3.05632183908046e-05,
"loss": 12.2051,
"step": 40600
},
{
"epoch": 31.18773946360153,
"grad_norm": 2.011462450027466,
"learning_rate": 3.0515325670498085e-05,
"loss": 12.1084,
"step": 40700
},
{
"epoch": 31.264367816091955,
"grad_norm": 1.100541114807129,
"learning_rate": 3.0467432950191572e-05,
"loss": 11.9174,
"step": 40800
},
{
"epoch": 31.340996168582375,
"grad_norm": 1.1993151903152466,
"learning_rate": 3.041954022988506e-05,
"loss": 12.0801,
"step": 40900
},
{
"epoch": 31.417624521072796,
"grad_norm": 1.501018762588501,
"learning_rate": 3.0371647509578542e-05,
"loss": 12.1011,
"step": 41000
},
{
"epoch": 31.49425287356322,
"grad_norm": 1.788327932357788,
"learning_rate": 3.032375478927203e-05,
"loss": 12.192,
"step": 41100
},
{
"epoch": 31.57088122605364,
"grad_norm": 1.7562750577926636,
"learning_rate": 3.0275862068965523e-05,
"loss": 11.829,
"step": 41200
},
{
"epoch": 31.64750957854406,
"grad_norm": 1.467976450920105,
"learning_rate": 3.0227969348659006e-05,
"loss": 12.0685,
"step": 41300
},
{
"epoch": 31.724137931034484,
"grad_norm": 2.4010770320892334,
"learning_rate": 3.0180076628352493e-05,
"loss": 12.0806,
"step": 41400
},
{
"epoch": 31.800766283524904,
"grad_norm": 1.759490728378296,
"learning_rate": 3.013218390804598e-05,
"loss": 12.1422,
"step": 41500
},
{
"epoch": 31.877394636015325,
"grad_norm": 1.6164530515670776,
"learning_rate": 3.0084291187739467e-05,
"loss": 12.0766,
"step": 41600
},
{
"epoch": 31.95402298850575,
"grad_norm": 1.3001078367233276,
"learning_rate": 3.0036398467432954e-05,
"loss": 12.0244,
"step": 41700
},
{
"epoch": 32.0,
"eval_loss": 12.876104354858398,
"eval_runtime": 44.1527,
"eval_samples_per_second": 29.557,
"eval_steps_per_second": 3.714,
"step": 41760
},
{
"epoch": 32.030651340996165,
"grad_norm": 1.1984444856643677,
"learning_rate": 2.9988505747126437e-05,
"loss": 12.1453,
"step": 41800
},
{
"epoch": 32.10727969348659,
"grad_norm": 0.9655357599258423,
"learning_rate": 2.9941091954022986e-05,
"loss": 11.8735,
"step": 41900
},
{
"epoch": 32.18390804597701,
"grad_norm": 1.0667262077331543,
"learning_rate": 2.989319923371648e-05,
"loss": 12.1566,
"step": 42000
},
{
"epoch": 32.26053639846743,
"grad_norm": 1.6131408214569092,
"learning_rate": 2.9845306513409966e-05,
"loss": 11.9729,
"step": 42100
},
{
"epoch": 32.337164750957854,
"grad_norm": 1.6158314943313599,
"learning_rate": 2.979741379310345e-05,
"loss": 12.0362,
"step": 42200
},
{
"epoch": 32.41379310344828,
"grad_norm": 1.189818263053894,
"learning_rate": 2.9749521072796937e-05,
"loss": 12.2135,
"step": 42300
},
{
"epoch": 32.490421455938694,
"grad_norm": 2.628614664077759,
"learning_rate": 2.9701628352490423e-05,
"loss": 12.032,
"step": 42400
},
{
"epoch": 32.56704980842912,
"grad_norm": 1.6809107065200806,
"learning_rate": 2.965373563218391e-05,
"loss": 11.81,
"step": 42500
},
{
"epoch": 32.64367816091954,
"grad_norm": 1.6311430931091309,
"learning_rate": 2.9605842911877397e-05,
"loss": 11.9348,
"step": 42600
},
{
"epoch": 32.72030651340996,
"grad_norm": 1.2387199401855469,
"learning_rate": 2.955795019157088e-05,
"loss": 12.0694,
"step": 42700
},
{
"epoch": 32.79693486590038,
"grad_norm": 1.7171186208724976,
"learning_rate": 2.9510057471264368e-05,
"loss": 11.9729,
"step": 42800
},
{
"epoch": 32.87356321839081,
"grad_norm": 1.6134984493255615,
"learning_rate": 2.9462164750957854e-05,
"loss": 12.1292,
"step": 42900
},
{
"epoch": 32.95019157088122,
"grad_norm": 2.2401788234710693,
"learning_rate": 2.941427203065134e-05,
"loss": 12.1613,
"step": 43000
},
{
"epoch": 33.0,
"eval_loss": 12.873848915100098,
"eval_runtime": 44.126,
"eval_samples_per_second": 29.574,
"eval_steps_per_second": 3.717,
"step": 43065
},
{
"epoch": 33.02681992337165,
"grad_norm": 1.260538935661316,
"learning_rate": 2.9366379310344828e-05,
"loss": 12.1855,
"step": 43100
},
{
"epoch": 33.10344827586207,
"grad_norm": 1.7840496301651,
"learning_rate": 2.9318486590038312e-05,
"loss": 12.0618,
"step": 43200
},
{
"epoch": 33.18007662835249,
"grad_norm": 1.162712574005127,
"learning_rate": 2.92705938697318e-05,
"loss": 12.2513,
"step": 43300
},
{
"epoch": 33.25670498084291,
"grad_norm": 3.618567705154419,
"learning_rate": 2.9222701149425292e-05,
"loss": 12.0614,
"step": 43400
},
{
"epoch": 33.333333333333336,
"grad_norm": 1.2605602741241455,
"learning_rate": 2.9174808429118776e-05,
"loss": 11.9763,
"step": 43500
},
{
"epoch": 33.40996168582375,
"grad_norm": 1.4304360151290894,
"learning_rate": 2.9126915708812263e-05,
"loss": 12.1044,
"step": 43600
},
{
"epoch": 33.486590038314176,
"grad_norm": 1.1767237186431885,
"learning_rate": 2.907902298850575e-05,
"loss": 11.8996,
"step": 43700
},
{
"epoch": 33.5632183908046,
"grad_norm": 1.6173638105392456,
"learning_rate": 2.9031130268199236e-05,
"loss": 11.969,
"step": 43800
},
{
"epoch": 33.63984674329502,
"grad_norm": 1.2231945991516113,
"learning_rate": 2.8983237547892723e-05,
"loss": 12.2301,
"step": 43900
},
{
"epoch": 33.71647509578544,
"grad_norm": 3.853048801422119,
"learning_rate": 2.8935344827586207e-05,
"loss": 11.9726,
"step": 44000
},
{
"epoch": 33.793103448275865,
"grad_norm": 1.4259275197982788,
"learning_rate": 2.8887452107279694e-05,
"loss": 11.9545,
"step": 44100
},
{
"epoch": 33.86973180076628,
"grad_norm": 2.5803606510162354,
"learning_rate": 2.883955938697318e-05,
"loss": 11.8867,
"step": 44200
},
{
"epoch": 33.946360153256705,
"grad_norm": 1.3688091039657593,
"learning_rate": 2.8791666666666667e-05,
"loss": 12.0033,
"step": 44300
},
{
"epoch": 34.0,
"eval_loss": 12.871088027954102,
"eval_runtime": 44.1202,
"eval_samples_per_second": 29.578,
"eval_steps_per_second": 3.717,
"step": 44370
},
{
"epoch": 34.02298850574713,
"grad_norm": 1.947970986366272,
"learning_rate": 2.8743773946360154e-05,
"loss": 11.9572,
"step": 44400
},
{
"epoch": 34.099616858237546,
"grad_norm": 1.9568095207214355,
"learning_rate": 2.8696360153256706e-05,
"loss": 12.0624,
"step": 44500
},
{
"epoch": 34.17624521072797,
"grad_norm": 1.4037648439407349,
"learning_rate": 2.8648467432950193e-05,
"loss": 11.8426,
"step": 44600
},
{
"epoch": 34.252873563218394,
"grad_norm": 2.5989620685577393,
"learning_rate": 2.860057471264368e-05,
"loss": 11.9217,
"step": 44700
},
{
"epoch": 34.32950191570881,
"grad_norm": 1.3627197742462158,
"learning_rate": 2.8552681992337167e-05,
"loss": 11.9418,
"step": 44800
},
{
"epoch": 34.406130268199234,
"grad_norm": 1.4087576866149902,
"learning_rate": 2.8504789272030654e-05,
"loss": 12.1608,
"step": 44900
},
{
"epoch": 34.48275862068966,
"grad_norm": 1.4856873750686646,
"learning_rate": 2.8456896551724137e-05,
"loss": 11.9778,
"step": 45000
},
{
"epoch": 34.559386973180075,
"grad_norm": 1.631663203239441,
"learning_rate": 2.8409003831417624e-05,
"loss": 12.0547,
"step": 45100
},
{
"epoch": 34.6360153256705,
"grad_norm": 2.1117138862609863,
"learning_rate": 2.836111111111111e-05,
"loss": 12.0824,
"step": 45200
},
{
"epoch": 34.71264367816092,
"grad_norm": 1.9915541410446167,
"learning_rate": 2.8313218390804598e-05,
"loss": 12.0984,
"step": 45300
},
{
"epoch": 34.78927203065134,
"grad_norm": 2.4851934909820557,
"learning_rate": 2.8265325670498085e-05,
"loss": 12.0646,
"step": 45400
},
{
"epoch": 34.86590038314176,
"grad_norm": 1.1414411067962646,
"learning_rate": 2.8217432950191575e-05,
"loss": 12.0986,
"step": 45500
},
{
"epoch": 34.94252873563218,
"grad_norm": 1.0578815937042236,
"learning_rate": 2.8169540229885062e-05,
"loss": 12.1035,
"step": 45600
},
{
"epoch": 35.0,
"eval_loss": 12.84704875946045,
"eval_runtime": 44.1331,
"eval_samples_per_second": 29.57,
"eval_steps_per_second": 3.716,
"step": 45675
},
{
"epoch": 35.019157088122604,
"grad_norm": 1.2231003046035767,
"learning_rate": 2.812164750957855e-05,
"loss": 12.2043,
"step": 45700
},
{
"epoch": 35.09578544061303,
"grad_norm": 1.6044613122940063,
"learning_rate": 2.8073754789272032e-05,
"loss": 11.9987,
"step": 45800
},
{
"epoch": 35.172413793103445,
"grad_norm": 1.208008050918579,
"learning_rate": 2.802586206896552e-05,
"loss": 11.7725,
"step": 45900
},
{
"epoch": 35.24904214559387,
"grad_norm": 1.8152436017990112,
"learning_rate": 2.7977969348659006e-05,
"loss": 11.9232,
"step": 46000
},
{
"epoch": 35.32567049808429,
"grad_norm": 0.9535597562789917,
"learning_rate": 2.7930076628352493e-05,
"loss": 12.2091,
"step": 46100
},
{
"epoch": 35.40229885057471,
"grad_norm": 1.5778999328613281,
"learning_rate": 2.7882183908045976e-05,
"loss": 12.0968,
"step": 46200
},
{
"epoch": 35.47892720306513,
"grad_norm": 1.5384963750839233,
"learning_rate": 2.7834291187739463e-05,
"loss": 12.1058,
"step": 46300
},
{
"epoch": 35.55555555555556,
"grad_norm": 1.1971815824508667,
"learning_rate": 2.778639846743295e-05,
"loss": 12.048,
"step": 46400
},
{
"epoch": 35.632183908045974,
"grad_norm": 1.2047299146652222,
"learning_rate": 2.7738505747126437e-05,
"loss": 12.0413,
"step": 46500
},
{
"epoch": 35.7088122605364,
"grad_norm": 1.6629399061203003,
"learning_rate": 2.7690613026819924e-05,
"loss": 11.9562,
"step": 46600
},
{
"epoch": 35.78544061302682,
"grad_norm": 1.8731905221939087,
"learning_rate": 2.7642720306513407e-05,
"loss": 12.0334,
"step": 46700
},
{
"epoch": 35.86206896551724,
"grad_norm": 1.5753523111343384,
"learning_rate": 2.75948275862069e-05,
"loss": 11.9348,
"step": 46800
},
{
"epoch": 35.93869731800766,
"grad_norm": 2.0848851203918457,
"learning_rate": 2.7546934865900388e-05,
"loss": 12.0199,
"step": 46900
},
{
"epoch": 36.0,
"eval_loss": 12.837443351745605,
"eval_runtime": 44.1529,
"eval_samples_per_second": 29.556,
"eval_steps_per_second": 3.714,
"step": 46980
},
{
"epoch": 36.015325670498086,
"grad_norm": 1.3191312551498413,
"learning_rate": 2.749904214559387e-05,
"loss": 12.1034,
"step": 47000
},
{
"epoch": 36.0919540229885,
"grad_norm": 1.8107291460037231,
"learning_rate": 2.7451149425287358e-05,
"loss": 11.9679,
"step": 47100
},
{
"epoch": 36.16858237547893,
"grad_norm": 2.29463529586792,
"learning_rate": 2.7403735632183906e-05,
"loss": 11.7111,
"step": 47200
},
{
"epoch": 36.24521072796935,
"grad_norm": 1.3297805786132812,
"learning_rate": 2.7355842911877393e-05,
"loss": 11.8913,
"step": 47300
},
{
"epoch": 36.32183908045977,
"grad_norm": 1.1663862466812134,
"learning_rate": 2.730795019157088e-05,
"loss": 12.0487,
"step": 47400
},
{
"epoch": 36.39846743295019,
"grad_norm": 1.4846138954162598,
"learning_rate": 2.7260057471264367e-05,
"loss": 12.1661,
"step": 47500
},
{
"epoch": 36.475095785440615,
"grad_norm": 1.8800255060195923,
"learning_rate": 2.7212164750957857e-05,
"loss": 11.9248,
"step": 47600
},
{
"epoch": 36.55172413793103,
"grad_norm": 1.7427587509155273,
"learning_rate": 2.7164272030651344e-05,
"loss": 12.0681,
"step": 47700
},
{
"epoch": 36.628352490421456,
"grad_norm": 2.0017685890197754,
"learning_rate": 2.711637931034483e-05,
"loss": 12.2556,
"step": 47800
},
{
"epoch": 36.70498084291188,
"grad_norm": 2.765782117843628,
"learning_rate": 2.7068486590038318e-05,
"loss": 11.8846,
"step": 47900
},
{
"epoch": 36.7816091954023,
"grad_norm": 1.519728422164917,
"learning_rate": 2.70205938697318e-05,
"loss": 12.0119,
"step": 48000
},
{
"epoch": 36.85823754789272,
"grad_norm": 1.091073989868164,
"learning_rate": 2.697270114942529e-05,
"loss": 12.1197,
"step": 48100
},
{
"epoch": 36.934865900383144,
"grad_norm": 1.3182342052459717,
"learning_rate": 2.6924808429118775e-05,
"loss": 12.0217,
"step": 48200
},
{
"epoch": 37.0,
"eval_loss": 12.849996566772461,
"eval_runtime": 44.1316,
"eval_samples_per_second": 29.571,
"eval_steps_per_second": 3.716,
"step": 48285
},
{
"epoch": 37.01149425287356,
"grad_norm": 1.9082536697387695,
"learning_rate": 2.6876915708812262e-05,
"loss": 12.2391,
"step": 48300
},
{
"epoch": 37.088122605363985,
"grad_norm": 1.5705393552780151,
"learning_rate": 2.682902298850575e-05,
"loss": 12.1329,
"step": 48400
},
{
"epoch": 37.16475095785441,
"grad_norm": 2.2240869998931885,
"learning_rate": 2.6781130268199233e-05,
"loss": 12.108,
"step": 48500
},
{
"epoch": 37.241379310344826,
"grad_norm": 1.357383370399475,
"learning_rate": 2.673323754789272e-05,
"loss": 11.9599,
"step": 48600
},
{
"epoch": 37.31800766283525,
"grad_norm": 2.1634521484375,
"learning_rate": 2.6685344827586206e-05,
"loss": 12.0339,
"step": 48700
},
{
"epoch": 37.39463601532567,
"grad_norm": 1.611195683479309,
"learning_rate": 2.6637452107279693e-05,
"loss": 12.0276,
"step": 48800
},
{
"epoch": 37.47126436781609,
"grad_norm": 1.3676810264587402,
"learning_rate": 2.6589559386973183e-05,
"loss": 11.9487,
"step": 48900
},
{
"epoch": 37.547892720306514,
"grad_norm": 1.4503991603851318,
"learning_rate": 2.654166666666667e-05,
"loss": 11.9166,
"step": 49000
},
{
"epoch": 37.62452107279694,
"grad_norm": 2.0941789150238037,
"learning_rate": 2.6493773946360157e-05,
"loss": 12.0909,
"step": 49100
},
{
"epoch": 37.701149425287355,
"grad_norm": 1.4591392278671265,
"learning_rate": 2.6445881226053644e-05,
"loss": 11.9453,
"step": 49200
},
{
"epoch": 37.77777777777778,
"grad_norm": 1.3402618169784546,
"learning_rate": 2.6397988505747128e-05,
"loss": 11.9431,
"step": 49300
},
{
"epoch": 37.8544061302682,
"grad_norm": 1.697449803352356,
"learning_rate": 2.6350095785440614e-05,
"loss": 11.8129,
"step": 49400
},
{
"epoch": 37.93103448275862,
"grad_norm": 1.5764317512512207,
"learning_rate": 2.63022030651341e-05,
"loss": 11.975,
"step": 49500
},
{
"epoch": 38.0,
"eval_loss": 12.832439422607422,
"eval_runtime": 44.0844,
"eval_samples_per_second": 29.602,
"eval_steps_per_second": 3.72,
"step": 49590
},
{
"epoch": 38.00766283524904,
"grad_norm": 3.7600104808807373,
"learning_rate": 2.6254310344827588e-05,
"loss": 12.1701,
"step": 49600
},
{
"epoch": 38.08429118773947,
"grad_norm": 1.9188120365142822,
"learning_rate": 2.6206417624521075e-05,
"loss": 12.0672,
"step": 49700
},
{
"epoch": 38.160919540229884,
"grad_norm": 1.5679752826690674,
"learning_rate": 2.615852490421456e-05,
"loss": 11.9374,
"step": 49800
},
{
"epoch": 38.23754789272031,
"grad_norm": 1.6603142023086548,
"learning_rate": 2.6110632183908045e-05,
"loss": 11.8708,
"step": 49900
},
{
"epoch": 38.31417624521073,
"grad_norm": 2.0302236080169678,
"learning_rate": 2.6062739463601532e-05,
"loss": 12.0997,
"step": 50000
},
{
"epoch": 38.39080459770115,
"grad_norm": 1.4646397829055786,
"learning_rate": 2.601484674329502e-05,
"loss": 12.1337,
"step": 50100
},
{
"epoch": 38.46743295019157,
"grad_norm": 2.1434216499328613,
"learning_rate": 2.5966954022988506e-05,
"loss": 12.063,
"step": 50200
},
{
"epoch": 38.54406130268199,
"grad_norm": 1.4451220035552979,
"learning_rate": 2.5919061302681996e-05,
"loss": 11.8743,
"step": 50300
},
{
"epoch": 38.62068965517241,
"grad_norm": 1.4875038862228394,
"learning_rate": 2.5871168582375483e-05,
"loss": 12.1545,
"step": 50400
},
{
"epoch": 38.69731800766284,
"grad_norm": 2.4424338340759277,
"learning_rate": 2.582327586206897e-05,
"loss": 11.9573,
"step": 50500
},
{
"epoch": 38.77394636015325,
"grad_norm": 1.0890432596206665,
"learning_rate": 2.5775383141762454e-05,
"loss": 11.894,
"step": 50600
},
{
"epoch": 38.85057471264368,
"grad_norm": 1.410107970237732,
"learning_rate": 2.572749042145594e-05,
"loss": 12.0408,
"step": 50700
},
{
"epoch": 38.9272030651341,
"grad_norm": 1.1632236242294312,
"learning_rate": 2.5679597701149427e-05,
"loss": 12.0218,
"step": 50800
},
{
"epoch": 39.0,
"eval_loss": 12.819197654724121,
"eval_runtime": 44.0917,
"eval_samples_per_second": 29.597,
"eval_steps_per_second": 3.72,
"step": 50895
},
{
"epoch": 39.00383141762452,
"grad_norm": 1.8346548080444336,
"learning_rate": 2.5631704980842914e-05,
"loss": 11.9914,
"step": 50900
},
{
"epoch": 39.08045977011494,
"grad_norm": 1.3156729936599731,
"learning_rate": 2.55838122605364e-05,
"loss": 11.882,
"step": 51000
},
{
"epoch": 39.157088122605366,
"grad_norm": 1.464136004447937,
"learning_rate": 2.5535919540229885e-05,
"loss": 12.0324,
"step": 51100
},
{
"epoch": 39.23371647509578,
"grad_norm": 1.40706205368042,
"learning_rate": 2.548802681992337e-05,
"loss": 12.0355,
"step": 51200
},
{
"epoch": 39.310344827586206,
"grad_norm": 1.1469753980636597,
"learning_rate": 2.544013409961686e-05,
"loss": 11.8437,
"step": 51300
},
{
"epoch": 39.38697318007663,
"grad_norm": 2.110839605331421,
"learning_rate": 2.5392241379310345e-05,
"loss": 12.0156,
"step": 51400
},
{
"epoch": 39.46360153256705,
"grad_norm": 1.0058891773223877,
"learning_rate": 2.534434865900383e-05,
"loss": 12.093,
"step": 51500
},
{
"epoch": 39.54022988505747,
"grad_norm": 1.7903035879135132,
"learning_rate": 2.5296455938697316e-05,
"loss": 12.1111,
"step": 51600
},
{
"epoch": 39.616858237547895,
"grad_norm": 1.7223442792892456,
"learning_rate": 2.524856321839081e-05,
"loss": 11.8909,
"step": 51700
},
{
"epoch": 39.69348659003831,
"grad_norm": 1.6216609477996826,
"learning_rate": 2.5200670498084293e-05,
"loss": 12.0638,
"step": 51800
},
{
"epoch": 39.770114942528735,
"grad_norm": 2.2488083839416504,
"learning_rate": 2.515277777777778e-05,
"loss": 12.193,
"step": 51900
},
{
"epoch": 39.84674329501916,
"grad_norm": 1.9876821041107178,
"learning_rate": 2.5104885057471267e-05,
"loss": 11.9594,
"step": 52000
},
{
"epoch": 39.923371647509576,
"grad_norm": 2.0479111671447754,
"learning_rate": 2.5056992337164753e-05,
"loss": 11.8695,
"step": 52100
},
{
"epoch": 40.0,
"grad_norm": 2.512753486633301,
"learning_rate": 2.500909961685824e-05,
"loss": 11.9546,
"step": 52200
},
{
"epoch": 40.0,
"eval_loss": 12.806585311889648,
"eval_runtime": 44.0741,
"eval_samples_per_second": 29.609,
"eval_steps_per_second": 3.721,
"step": 52200
},
{
"epoch": 40.076628352490424,
"grad_norm": 1.4184033870697021,
"learning_rate": 2.4961206896551724e-05,
"loss": 11.9875,
"step": 52300
},
{
"epoch": 40.15325670498084,
"grad_norm": 2.1215152740478516,
"learning_rate": 2.491331417624521e-05,
"loss": 11.8898,
"step": 52400
},
{
"epoch": 40.229885057471265,
"grad_norm": 1.5458124876022339,
"learning_rate": 2.4865421455938698e-05,
"loss": 12.2281,
"step": 52500
},
{
"epoch": 40.30651340996169,
"grad_norm": 1.336580753326416,
"learning_rate": 2.4817528735632184e-05,
"loss": 11.743,
"step": 52600
},
{
"epoch": 40.383141762452105,
"grad_norm": 1.1983288526535034,
"learning_rate": 2.476963601532567e-05,
"loss": 12.0526,
"step": 52700
},
{
"epoch": 40.45977011494253,
"grad_norm": 3.6479368209838867,
"learning_rate": 2.4721743295019158e-05,
"loss": 11.9597,
"step": 52800
},
{
"epoch": 40.53639846743295,
"grad_norm": 2.154127359390259,
"learning_rate": 2.467432950191571e-05,
"loss": 11.9651,
"step": 52900
},
{
"epoch": 40.61302681992337,
"grad_norm": 1.476364016532898,
"learning_rate": 2.4626436781609197e-05,
"loss": 11.8092,
"step": 53000
},
{
"epoch": 40.689655172413794,
"grad_norm": 1.9797921180725098,
"learning_rate": 2.4578544061302684e-05,
"loss": 12.1406,
"step": 53100
},
{
"epoch": 40.76628352490422,
"grad_norm": 1.5220038890838623,
"learning_rate": 2.453065134099617e-05,
"loss": 11.8779,
"step": 53200
},
{
"epoch": 40.842911877394634,
"grad_norm": 1.1830068826675415,
"learning_rate": 2.4482758620689654e-05,
"loss": 12.0007,
"step": 53300
},
{
"epoch": 40.91954022988506,
"grad_norm": 1.3260859251022339,
"learning_rate": 2.4434865900383144e-05,
"loss": 12.1607,
"step": 53400
},
{
"epoch": 40.99616858237548,
"grad_norm": 1.8781402111053467,
"learning_rate": 2.438697318007663e-05,
"loss": 11.9159,
"step": 53500
},
{
"epoch": 41.0,
"eval_loss": 12.82541275024414,
"eval_runtime": 44.0679,
"eval_samples_per_second": 29.613,
"eval_steps_per_second": 3.722,
"step": 53505
},
{
"epoch": 41.0727969348659,
"grad_norm": 3.089315891265869,
"learning_rate": 2.4339080459770118e-05,
"loss": 12.0552,
"step": 53600
},
{
"epoch": 41.14942528735632,
"grad_norm": 1.9572243690490723,
"learning_rate": 2.42911877394636e-05,
"loss": 12.0124,
"step": 53700
},
{
"epoch": 41.22605363984675,
"grad_norm": 1.6215753555297852,
"learning_rate": 2.424329501915709e-05,
"loss": 11.9782,
"step": 53800
},
{
"epoch": 41.30268199233716,
"grad_norm": 1.3075189590454102,
"learning_rate": 2.4195402298850575e-05,
"loss": 12.2317,
"step": 53900
},
{
"epoch": 41.37931034482759,
"grad_norm": 1.1214234828948975,
"learning_rate": 2.4147509578544062e-05,
"loss": 12.1511,
"step": 54000
},
{
"epoch": 41.45593869731801,
"grad_norm": 8.386270523071289,
"learning_rate": 2.409961685823755e-05,
"loss": 11.8253,
"step": 54100
},
{
"epoch": 41.53256704980843,
"grad_norm": 5.074198246002197,
"learning_rate": 2.4051724137931036e-05,
"loss": 12.0205,
"step": 54200
},
{
"epoch": 41.60919540229885,
"grad_norm": 1.2190698385238647,
"learning_rate": 2.4003831417624523e-05,
"loss": 11.9438,
"step": 54300
},
{
"epoch": 41.68582375478927,
"grad_norm": 1.3544102907180786,
"learning_rate": 2.395593869731801e-05,
"loss": 12.1235,
"step": 54400
},
{
"epoch": 41.76245210727969,
"grad_norm": 1.080891489982605,
"learning_rate": 2.3908045977011497e-05,
"loss": 11.7676,
"step": 54500
},
{
"epoch": 41.839080459770116,
"grad_norm": 1.453224539756775,
"learning_rate": 2.386015325670498e-05,
"loss": 12.0158,
"step": 54600
},
{
"epoch": 41.91570881226053,
"grad_norm": 1.3428503274917603,
"learning_rate": 2.3812260536398467e-05,
"loss": 11.8066,
"step": 54700
},
{
"epoch": 41.99233716475096,
"grad_norm": 1.3496088981628418,
"learning_rate": 2.3764367816091957e-05,
"loss": 11.8988,
"step": 54800
},
{
"epoch": 42.0,
"eval_loss": 12.804805755615234,
"eval_runtime": 44.1053,
"eval_samples_per_second": 29.588,
"eval_steps_per_second": 3.718,
"step": 54810
},
{
"epoch": 42.06896551724138,
"grad_norm": 1.2151437997817993,
"learning_rate": 2.3716475095785444e-05,
"loss": 12.1893,
"step": 54900
},
{
"epoch": 42.1455938697318,
"grad_norm": 1.6184425354003906,
"learning_rate": 2.3669061302681993e-05,
"loss": 12.0546,
"step": 55000
},
{
"epoch": 42.22222222222222,
"grad_norm": 1.6667332649230957,
"learning_rate": 2.362116858237548e-05,
"loss": 11.7933,
"step": 55100
},
{
"epoch": 42.298850574712645,
"grad_norm": 3.835425615310669,
"learning_rate": 2.3573275862068966e-05,
"loss": 11.9275,
"step": 55200
},
{
"epoch": 42.37547892720306,
"grad_norm": 4.450900554656982,
"learning_rate": 2.3525383141762453e-05,
"loss": 12.1853,
"step": 55300
},
{
"epoch": 42.452107279693486,
"grad_norm": 1.4358230829238892,
"learning_rate": 2.347749042145594e-05,
"loss": 12.0228,
"step": 55400
},
{
"epoch": 42.52873563218391,
"grad_norm": 1.6793595552444458,
"learning_rate": 2.3429597701149427e-05,
"loss": 11.9595,
"step": 55500
},
{
"epoch": 42.60536398467433,
"grad_norm": 1.305600643157959,
"learning_rate": 2.3381704980842914e-05,
"loss": 11.8126,
"step": 55600
},
{
"epoch": 42.68199233716475,
"grad_norm": 1.5794193744659424,
"learning_rate": 2.33338122605364e-05,
"loss": 12.0154,
"step": 55700
},
{
"epoch": 42.758620689655174,
"grad_norm": 1.6401104927062988,
"learning_rate": 2.3285919540229888e-05,
"loss": 11.8344,
"step": 55800
},
{
"epoch": 42.83524904214559,
"grad_norm": 1.6348859071731567,
"learning_rate": 2.323802681992337e-05,
"loss": 12.0174,
"step": 55900
},
{
"epoch": 42.911877394636015,
"grad_norm": 2.6531448364257812,
"learning_rate": 2.3190134099616858e-05,
"loss": 11.8581,
"step": 56000
},
{
"epoch": 42.98850574712644,
"grad_norm": 1.423274040222168,
"learning_rate": 2.3142241379310345e-05,
"loss": 11.9313,
"step": 56100
},
{
"epoch": 43.0,
"eval_loss": 12.791069030761719,
"eval_runtime": 44.1222,
"eval_samples_per_second": 29.577,
"eval_steps_per_second": 3.717,
"step": 56115
},
{
"epoch": 43.065134099616856,
"grad_norm": 1.3258931636810303,
"learning_rate": 2.3094348659003835e-05,
"loss": 11.8864,
"step": 56200
},
{
"epoch": 43.14176245210728,
"grad_norm": 1.4615380764007568,
"learning_rate": 2.304645593869732e-05,
"loss": 12.0657,
"step": 56300
},
{
"epoch": 43.2183908045977,
"grad_norm": 1.4611597061157227,
"learning_rate": 2.2998563218390805e-05,
"loss": 11.9148,
"step": 56400
},
{
"epoch": 43.29501915708812,
"grad_norm": 1.7766637802124023,
"learning_rate": 2.2950670498084292e-05,
"loss": 12.0493,
"step": 56500
},
{
"epoch": 43.371647509578544,
"grad_norm": 1.8123854398727417,
"learning_rate": 2.290277777777778e-05,
"loss": 11.8749,
"step": 56600
},
{
"epoch": 43.44827586206897,
"grad_norm": 2.2500967979431152,
"learning_rate": 2.2854885057471266e-05,
"loss": 12.0237,
"step": 56700
},
{
"epoch": 43.524904214559385,
"grad_norm": 1.44577157497406,
"learning_rate": 2.280699233716475e-05,
"loss": 11.8103,
"step": 56800
},
{
"epoch": 43.60153256704981,
"grad_norm": 1.2959234714508057,
"learning_rate": 2.275909961685824e-05,
"loss": 12.1443,
"step": 56900
},
{
"epoch": 43.67816091954023,
"grad_norm": 1.849253535270691,
"learning_rate": 2.2711206896551727e-05,
"loss": 12.037,
"step": 57000
},
{
"epoch": 43.75478927203065,
"grad_norm": 1.46470046043396,
"learning_rate": 2.266379310344828e-05,
"loss": 12.0392,
"step": 57100
},
{
"epoch": 43.83141762452107,
"grad_norm": 1.7397308349609375,
"learning_rate": 2.2615900383141765e-05,
"loss": 11.8446,
"step": 57200
},
{
"epoch": 43.9080459770115,
"grad_norm": 1.1144057512283325,
"learning_rate": 2.256800766283525e-05,
"loss": 12.0084,
"step": 57300
},
{
"epoch": 43.984674329501914,
"grad_norm": 4.426650047302246,
"learning_rate": 2.2520114942528736e-05,
"loss": 12.0514,
"step": 57400
},
{
"epoch": 44.0,
"eval_loss": 12.808330535888672,
"eval_runtime": 44.0792,
"eval_samples_per_second": 29.606,
"eval_steps_per_second": 3.721,
"step": 57420
},
{
"epoch": 44.06130268199234,
"grad_norm": 1.1355741024017334,
"learning_rate": 2.2472222222222223e-05,
"loss": 11.9243,
"step": 57500
},
{
"epoch": 44.13793103448276,
"grad_norm": 1.5547679662704468,
"learning_rate": 2.2424329501915713e-05,
"loss": 12.0711,
"step": 57600
},
{
"epoch": 44.21455938697318,
"grad_norm": 1.5729808807373047,
"learning_rate": 2.2376436781609196e-05,
"loss": 11.9867,
"step": 57700
},
{
"epoch": 44.2911877394636,
"grad_norm": 1.2912790775299072,
"learning_rate": 2.2328544061302683e-05,
"loss": 11.8632,
"step": 57800
},
{
"epoch": 44.367816091954026,
"grad_norm": 1.2545444965362549,
"learning_rate": 2.228065134099617e-05,
"loss": 12.0665,
"step": 57900
},
{
"epoch": 44.44444444444444,
"grad_norm": 1.3165549039840698,
"learning_rate": 2.2232758620689657e-05,
"loss": 11.842,
"step": 58000
},
{
"epoch": 44.52107279693487,
"grad_norm": 1.7680951356887817,
"learning_rate": 2.218486590038314e-05,
"loss": 11.8055,
"step": 58100
},
{
"epoch": 44.59770114942529,
"grad_norm": 2.2426023483276367,
"learning_rate": 2.2136973180076627e-05,
"loss": 12.1153,
"step": 58200
},
{
"epoch": 44.67432950191571,
"grad_norm": 0.9581509828567505,
"learning_rate": 2.2089080459770118e-05,
"loss": 11.8089,
"step": 58300
},
{
"epoch": 44.75095785440613,
"grad_norm": 2.1268539428710938,
"learning_rate": 2.2041187739463605e-05,
"loss": 11.8902,
"step": 58400
},
{
"epoch": 44.827586206896555,
"grad_norm": 1.2000526189804077,
"learning_rate": 2.1993295019157088e-05,
"loss": 11.8651,
"step": 58500
},
{
"epoch": 44.90421455938697,
"grad_norm": 2.349942684173584,
"learning_rate": 2.1945402298850575e-05,
"loss": 11.9236,
"step": 58600
},
{
"epoch": 44.980842911877396,
"grad_norm": 1.639948844909668,
"learning_rate": 2.1897509578544062e-05,
"loss": 11.9533,
"step": 58700
},
{
"epoch": 45.0,
"eval_loss": 12.792840003967285,
"eval_runtime": 44.0555,
"eval_samples_per_second": 29.622,
"eval_steps_per_second": 3.723,
"step": 58725
},
{
"epoch": 45.05747126436781,
"grad_norm": 0.9822871088981628,
"learning_rate": 2.184961685823755e-05,
"loss": 11.9065,
"step": 58800
},
{
"epoch": 45.13409961685824,
"grad_norm": 5.536319255828857,
"learning_rate": 2.1801724137931036e-05,
"loss": 11.9411,
"step": 58900
},
{
"epoch": 45.21072796934866,
"grad_norm": 1.8267079591751099,
"learning_rate": 2.1753831417624522e-05,
"loss": 11.8592,
"step": 59000
},
{
"epoch": 45.28735632183908,
"grad_norm": 1.453710675239563,
"learning_rate": 2.170593869731801e-05,
"loss": 12.246,
"step": 59100
},
{
"epoch": 45.3639846743295,
"grad_norm": 1.5747921466827393,
"learning_rate": 2.1658045977011496e-05,
"loss": 12.1555,
"step": 59200
},
{
"epoch": 45.440613026819925,
"grad_norm": 0.9929379224777222,
"learning_rate": 2.1610153256704983e-05,
"loss": 11.7682,
"step": 59300
},
{
"epoch": 45.51724137931034,
"grad_norm": 1.4931187629699707,
"learning_rate": 2.1562260536398467e-05,
"loss": 11.8555,
"step": 59400
},
{
"epoch": 45.593869731800766,
"grad_norm": 1.114998459815979,
"learning_rate": 2.1514367816091953e-05,
"loss": 11.8726,
"step": 59500
},
{
"epoch": 45.67049808429119,
"grad_norm": 1.7308725118637085,
"learning_rate": 2.146647509578544e-05,
"loss": 12.0875,
"step": 59600
},
{
"epoch": 45.747126436781606,
"grad_norm": 1.1630358695983887,
"learning_rate": 2.141858237547893e-05,
"loss": 11.8994,
"step": 59700
},
{
"epoch": 45.82375478927203,
"grad_norm": 1.9863486289978027,
"learning_rate": 2.1370689655172414e-05,
"loss": 11.9502,
"step": 59800
},
{
"epoch": 45.900383141762454,
"grad_norm": 1.3612456321716309,
"learning_rate": 2.13227969348659e-05,
"loss": 11.8048,
"step": 59900
},
{
"epoch": 45.97701149425287,
"grad_norm": 1.1734110116958618,
"learning_rate": 2.1274904214559388e-05,
"loss": 12.1155,
"step": 60000
},
{
"epoch": 46.0,
"eval_loss": 12.802705764770508,
"eval_runtime": 44.0902,
"eval_samples_per_second": 29.598,
"eval_steps_per_second": 3.72,
"step": 60030
},
{
"epoch": 46.053639846743295,
"grad_norm": 2.19791841506958,
"learning_rate": 2.1227011494252875e-05,
"loss": 12.0121,
"step": 60100
},
{
"epoch": 46.13026819923372,
"grad_norm": 3.206514358520508,
"learning_rate": 2.1179597701149426e-05,
"loss": 11.9131,
"step": 60200
},
{
"epoch": 46.206896551724135,
"grad_norm": 1.2101006507873535,
"learning_rate": 2.1131704980842913e-05,
"loss": 11.869,
"step": 60300
},
{
"epoch": 46.28352490421456,
"grad_norm": 1.3384582996368408,
"learning_rate": 2.10838122605364e-05,
"loss": 11.7608,
"step": 60400
},
{
"epoch": 46.36015325670498,
"grad_norm": 3.215064764022827,
"learning_rate": 2.1035919540229887e-05,
"loss": 12.0495,
"step": 60500
},
{
"epoch": 46.4367816091954,
"grad_norm": 1.26254403591156,
"learning_rate": 2.0988026819923374e-05,
"loss": 11.8855,
"step": 60600
},
{
"epoch": 46.513409961685824,
"grad_norm": 1.139722466468811,
"learning_rate": 2.094013409961686e-05,
"loss": 12.0157,
"step": 60700
},
{
"epoch": 46.59003831417625,
"grad_norm": 1.9146323204040527,
"learning_rate": 2.0892241379310344e-05,
"loss": 11.8276,
"step": 60800
},
{
"epoch": 46.666666666666664,
"grad_norm": 1.6539549827575684,
"learning_rate": 2.084434865900383e-05,
"loss": 11.9677,
"step": 60900
},
{
"epoch": 46.74329501915709,
"grad_norm": 1.2380534410476685,
"learning_rate": 2.0796455938697318e-05,
"loss": 12.0291,
"step": 61000
},
{
"epoch": 46.81992337164751,
"grad_norm": 1.8375437259674072,
"learning_rate": 2.074856321839081e-05,
"loss": 11.9032,
"step": 61100
},
{
"epoch": 46.89655172413793,
"grad_norm": 2.2188262939453125,
"learning_rate": 2.0700670498084292e-05,
"loss": 12.0465,
"step": 61200
},
{
"epoch": 46.97318007662835,
"grad_norm": 1.1582258939743042,
"learning_rate": 2.065277777777778e-05,
"loss": 11.924,
"step": 61300
},
{
"epoch": 47.0,
"eval_loss": 12.797731399536133,
"eval_runtime": 44.1559,
"eval_samples_per_second": 29.554,
"eval_steps_per_second": 3.714,
"step": 61335
},
{
"epoch": 47.04980842911878,
"grad_norm": 3.067289352416992,
"learning_rate": 2.0604885057471266e-05,
"loss": 11.8265,
"step": 61400
},
{
"epoch": 47.12643678160919,
"grad_norm": 1.3472516536712646,
"learning_rate": 2.0556992337164752e-05,
"loss": 11.8763,
"step": 61500
},
{
"epoch": 47.20306513409962,
"grad_norm": 1.4235740900039673,
"learning_rate": 2.050909961685824e-05,
"loss": 11.9473,
"step": 61600
},
{
"epoch": 47.27969348659004,
"grad_norm": 1.3170359134674072,
"learning_rate": 2.0461206896551723e-05,
"loss": 11.9381,
"step": 61700
},
{
"epoch": 47.35632183908046,
"grad_norm": 1.6014246940612793,
"learning_rate": 2.0413314176245213e-05,
"loss": 11.9074,
"step": 61800
},
{
"epoch": 47.43295019157088,
"grad_norm": 1.3270535469055176,
"learning_rate": 2.03654214559387e-05,
"loss": 11.9903,
"step": 61900
},
{
"epoch": 47.509578544061306,
"grad_norm": 1.1905503273010254,
"learning_rate": 2.0317528735632187e-05,
"loss": 11.9629,
"step": 62000
},
{
"epoch": 47.58620689655172,
"grad_norm": 1.546738862991333,
"learning_rate": 2.026963601532567e-05,
"loss": 11.831,
"step": 62100
},
{
"epoch": 47.662835249042146,
"grad_norm": 1.5887172222137451,
"learning_rate": 2.0221743295019157e-05,
"loss": 12.0534,
"step": 62200
},
{
"epoch": 47.73946360153257,
"grad_norm": 1.3189942836761475,
"learning_rate": 2.0173850574712644e-05,
"loss": 11.9131,
"step": 62300
},
{
"epoch": 47.81609195402299,
"grad_norm": 1.9591014385223389,
"learning_rate": 2.012595785440613e-05,
"loss": 11.8583,
"step": 62400
},
{
"epoch": 47.89272030651341,
"grad_norm": 1.6344765424728394,
"learning_rate": 2.0078065134099618e-05,
"loss": 11.9921,
"step": 62500
},
{
"epoch": 47.969348659003835,
"grad_norm": 1.1810266971588135,
"learning_rate": 2.0030172413793105e-05,
"loss": 11.9987,
"step": 62600
},
{
"epoch": 48.0,
"eval_loss": 12.767735481262207,
"eval_runtime": 44.144,
"eval_samples_per_second": 29.562,
"eval_steps_per_second": 3.715,
"step": 62640
},
{
"epoch": 48.04597701149425,
"grad_norm": 1.4370075464248657,
"learning_rate": 1.998227969348659e-05,
"loss": 12.0014,
"step": 62700
},
{
"epoch": 48.122605363984675,
"grad_norm": 1.2901791334152222,
"learning_rate": 1.993438697318008e-05,
"loss": 12.0385,
"step": 62800
},
{
"epoch": 48.1992337164751,
"grad_norm": 1.2324562072753906,
"learning_rate": 1.9886494252873565e-05,
"loss": 11.9594,
"step": 62900
},
{
"epoch": 48.275862068965516,
"grad_norm": 1.40041983127594,
"learning_rate": 1.983860153256705e-05,
"loss": 11.76,
"step": 63000
},
{
"epoch": 48.35249042145594,
"grad_norm": 1.5981560945510864,
"learning_rate": 1.9790708812260536e-05,
"loss": 11.8416,
"step": 63100
},
{
"epoch": 48.42911877394636,
"grad_norm": 1.5366255044937134,
"learning_rate": 1.974329501915709e-05,
"loss": 11.9168,
"step": 63200
},
{
"epoch": 48.50574712643678,
"grad_norm": 2.1091346740722656,
"learning_rate": 1.9695402298850578e-05,
"loss": 11.7809,
"step": 63300
},
{
"epoch": 48.582375478927204,
"grad_norm": 3.076678991317749,
"learning_rate": 1.964750957854406e-05,
"loss": 11.8881,
"step": 63400
},
{
"epoch": 48.65900383141762,
"grad_norm": 1.6555073261260986,
"learning_rate": 1.9599616858237548e-05,
"loss": 11.6799,
"step": 63500
},
{
"epoch": 48.735632183908045,
"grad_norm": 1.2696727514266968,
"learning_rate": 1.9551724137931035e-05,
"loss": 12.0306,
"step": 63600
},
{
"epoch": 48.81226053639847,
"grad_norm": 1.739827275276184,
"learning_rate": 1.9503831417624522e-05,
"loss": 12.1005,
"step": 63700
},
{
"epoch": 48.888888888888886,
"grad_norm": 1.187231421470642,
"learning_rate": 1.945593869731801e-05,
"loss": 11.9703,
"step": 63800
},
{
"epoch": 48.96551724137931,
"grad_norm": 2.756282091140747,
"learning_rate": 1.9408045977011496e-05,
"loss": 12.0693,
"step": 63900
},
{
"epoch": 49.0,
"eval_loss": 12.775006294250488,
"eval_runtime": 44.1249,
"eval_samples_per_second": 29.575,
"eval_steps_per_second": 3.717,
"step": 63945
},
{
"epoch": 49.04214559386973,
"grad_norm": 0.967854917049408,
"learning_rate": 1.9360153256704983e-05,
"loss": 11.9437,
"step": 64000
},
{
"epoch": 49.11877394636015,
"grad_norm": 1.2055004835128784,
"learning_rate": 1.931226053639847e-05,
"loss": 11.9037,
"step": 64100
},
{
"epoch": 49.195402298850574,
"grad_norm": 1.6203746795654297,
"learning_rate": 1.9264367816091956e-05,
"loss": 11.9823,
"step": 64200
},
{
"epoch": 49.272030651341,
"grad_norm": 1.1399292945861816,
"learning_rate": 1.921647509578544e-05,
"loss": 12.0721,
"step": 64300
},
{
"epoch": 49.348659003831415,
"grad_norm": 1.3431105613708496,
"learning_rate": 1.9168582375478927e-05,
"loss": 11.8897,
"step": 64400
},
{
"epoch": 49.42528735632184,
"grad_norm": 1.316723346710205,
"learning_rate": 1.9120689655172414e-05,
"loss": 11.9025,
"step": 64500
},
{
"epoch": 49.50191570881226,
"grad_norm": 1.8449369668960571,
"learning_rate": 1.9072796934865904e-05,
"loss": 11.6683,
"step": 64600
},
{
"epoch": 49.57854406130268,
"grad_norm": 1.3772321939468384,
"learning_rate": 1.9024904214559387e-05,
"loss": 12.2022,
"step": 64700
},
{
"epoch": 49.6551724137931,
"grad_norm": 2.2538058757781982,
"learning_rate": 1.8977011494252874e-05,
"loss": 11.8425,
"step": 64800
},
{
"epoch": 49.73180076628353,
"grad_norm": 2.1310970783233643,
"learning_rate": 1.892911877394636e-05,
"loss": 11.9638,
"step": 64900
},
{
"epoch": 49.808429118773944,
"grad_norm": 1.2570499181747437,
"learning_rate": 1.8881226053639848e-05,
"loss": 12.0367,
"step": 65000
},
{
"epoch": 49.88505747126437,
"grad_norm": 1.6000453233718872,
"learning_rate": 1.8833333333333335e-05,
"loss": 12.0249,
"step": 65100
},
{
"epoch": 49.96168582375479,
"grad_norm": 1.2556895017623901,
"learning_rate": 1.878544061302682e-05,
"loss": 11.9285,
"step": 65200
},
{
"epoch": 50.0,
"eval_loss": 12.788679122924805,
"eval_runtime": 44.0734,
"eval_samples_per_second": 29.61,
"eval_steps_per_second": 3.721,
"step": 65250
},
{
"epoch": 50.03831417624521,
"grad_norm": 1.4611543416976929,
"learning_rate": 1.873754789272031e-05,
"loss": 12.0139,
"step": 65300
},
{
"epoch": 50.11494252873563,
"grad_norm": 1.3939285278320312,
"learning_rate": 1.869013409961686e-05,
"loss": 12.1466,
"step": 65400
},
{
"epoch": 50.191570881226056,
"grad_norm": 1.378446102142334,
"learning_rate": 1.8642241379310347e-05,
"loss": 12.0221,
"step": 65500
},
{
"epoch": 50.26819923371647,
"grad_norm": 1.1458476781845093,
"learning_rate": 1.859434865900383e-05,
"loss": 11.98,
"step": 65600
},
{
"epoch": 50.3448275862069,
"grad_norm": 1.2113792896270752,
"learning_rate": 1.8546455938697318e-05,
"loss": 11.7938,
"step": 65700
},
{
"epoch": 50.42145593869732,
"grad_norm": 3.7647705078125,
"learning_rate": 1.8498563218390804e-05,
"loss": 12.046,
"step": 65800
},
{
"epoch": 50.49808429118774,
"grad_norm": 1.4086334705352783,
"learning_rate": 1.845067049808429e-05,
"loss": 12.0137,
"step": 65900
},
{
"epoch": 50.57471264367816,
"grad_norm": 2.212301254272461,
"learning_rate": 1.8402777777777778e-05,
"loss": 11.8535,
"step": 66000
},
{
"epoch": 50.651340996168585,
"grad_norm": 1.1334259510040283,
"learning_rate": 1.8354885057471265e-05,
"loss": 11.7534,
"step": 66100
},
{
"epoch": 50.727969348659,
"grad_norm": 1.3607604503631592,
"learning_rate": 1.8306992337164752e-05,
"loss": 12.1351,
"step": 66200
},
{
"epoch": 50.804597701149426,
"grad_norm": 0.9516454935073853,
"learning_rate": 1.825909961685824e-05,
"loss": 11.8739,
"step": 66300
},
{
"epoch": 50.88122605363985,
"grad_norm": 1.7874857187271118,
"learning_rate": 1.8211206896551726e-05,
"loss": 12.0046,
"step": 66400
},
{
"epoch": 50.95785440613027,
"grad_norm": 1.1303731203079224,
"learning_rate": 1.816331417624521e-05,
"loss": 11.8135,
"step": 66500
},
{
"epoch": 51.0,
"eval_loss": 12.762798309326172,
"eval_runtime": 44.1477,
"eval_samples_per_second": 29.56,
"eval_steps_per_second": 3.715,
"step": 66555
},
{
"epoch": 51.03448275862069,
"grad_norm": 2.8881723880767822,
"learning_rate": 1.8115421455938696e-05,
"loss": 11.8533,
"step": 66600
},
{
"epoch": 51.111111111111114,
"grad_norm": 1.2278690338134766,
"learning_rate": 1.8067528735632186e-05,
"loss": 11.9214,
"step": 66700
},
{
"epoch": 51.18773946360153,
"grad_norm": 1.9933656454086304,
"learning_rate": 1.8019636015325673e-05,
"loss": 11.8527,
"step": 66800
},
{
"epoch": 51.264367816091955,
"grad_norm": 1.4205143451690674,
"learning_rate": 1.7971743295019157e-05,
"loss": 12.0251,
"step": 66900
},
{
"epoch": 51.34099616858238,
"grad_norm": 1.319817304611206,
"learning_rate": 1.7923850574712644e-05,
"loss": 12.0983,
"step": 67000
},
{
"epoch": 51.417624521072796,
"grad_norm": 1.6209360361099243,
"learning_rate": 1.787595785440613e-05,
"loss": 11.8053,
"step": 67100
},
{
"epoch": 51.49425287356322,
"grad_norm": 1.0465126037597656,
"learning_rate": 1.7828065134099617e-05,
"loss": 12.0158,
"step": 67200
},
{
"epoch": 51.57088122605364,
"grad_norm": 1.4087551832199097,
"learning_rate": 1.7780172413793104e-05,
"loss": 11.9305,
"step": 67300
},
{
"epoch": 51.64750957854406,
"grad_norm": 1.121779203414917,
"learning_rate": 1.773227969348659e-05,
"loss": 12.1881,
"step": 67400
},
{
"epoch": 51.724137931034484,
"grad_norm": 1.5989633798599243,
"learning_rate": 1.7684386973180078e-05,
"loss": 11.9698,
"step": 67500
},
{
"epoch": 51.8007662835249,
"grad_norm": 1.1244069337844849,
"learning_rate": 1.7636494252873565e-05,
"loss": 11.7475,
"step": 67600
},
{
"epoch": 51.877394636015325,
"grad_norm": 1.2594223022460938,
"learning_rate": 1.7589080459770117e-05,
"loss": 11.9611,
"step": 67700
},
{
"epoch": 51.95402298850575,
"grad_norm": 1.6870946884155273,
"learning_rate": 1.7541187739463604e-05,
"loss": 11.7075,
"step": 67800
},
{
"epoch": 52.0,
"eval_loss": 12.814347267150879,
"eval_runtime": 44.0743,
"eval_samples_per_second": 29.609,
"eval_steps_per_second": 3.721,
"step": 67860
},
{
"epoch": 52.030651340996165,
"grad_norm": 1.1319911479949951,
"learning_rate": 1.7493295019157087e-05,
"loss": 11.8327,
"step": 67900
},
{
"epoch": 52.10727969348659,
"grad_norm": 1.0522786378860474,
"learning_rate": 1.7445402298850574e-05,
"loss": 11.858,
"step": 68000
},
{
"epoch": 52.18390804597701,
"grad_norm": 1.7333852052688599,
"learning_rate": 1.7397509578544064e-05,
"loss": 12.037,
"step": 68100
},
{
"epoch": 52.26053639846743,
"grad_norm": 1.7924898862838745,
"learning_rate": 1.734961685823755e-05,
"loss": 12.0778,
"step": 68200
},
{
"epoch": 52.337164750957854,
"grad_norm": 1.221550464630127,
"learning_rate": 1.7301724137931035e-05,
"loss": 12.1365,
"step": 68300
},
{
"epoch": 52.41379310344828,
"grad_norm": 1.6241466999053955,
"learning_rate": 1.725383141762452e-05,
"loss": 12.0267,
"step": 68400
},
{
"epoch": 52.490421455938694,
"grad_norm": 1.7579493522644043,
"learning_rate": 1.720593869731801e-05,
"loss": 11.7834,
"step": 68500
},
{
"epoch": 52.56704980842912,
"grad_norm": 1.4909967184066772,
"learning_rate": 1.7158045977011495e-05,
"loss": 11.9632,
"step": 68600
},
{
"epoch": 52.64367816091954,
"grad_norm": 2.0708203315734863,
"learning_rate": 1.7110153256704982e-05,
"loss": 11.9318,
"step": 68700
},
{
"epoch": 52.72030651340996,
"grad_norm": 1.1900310516357422,
"learning_rate": 1.706226053639847e-05,
"loss": 11.8145,
"step": 68800
},
{
"epoch": 52.79693486590038,
"grad_norm": 1.2245934009552002,
"learning_rate": 1.7014367816091956e-05,
"loss": 11.6663,
"step": 68900
},
{
"epoch": 52.87356321839081,
"grad_norm": 1.6178796291351318,
"learning_rate": 1.6966475095785443e-05,
"loss": 11.9844,
"step": 69000
},
{
"epoch": 52.95019157088122,
"grad_norm": 1.2077674865722656,
"learning_rate": 1.691858237547893e-05,
"loss": 11.6393,
"step": 69100
},
{
"epoch": 53.0,
"eval_loss": 12.77491283416748,
"eval_runtime": 44.1152,
"eval_samples_per_second": 29.582,
"eval_steps_per_second": 3.718,
"step": 69165
},
{
"epoch": 53.02681992337165,
"grad_norm": 1.2087703943252563,
"learning_rate": 1.6870689655172413e-05,
"loss": 11.8316,
"step": 69200
},
{
"epoch": 53.10344827586207,
"grad_norm": 1.472959280014038,
"learning_rate": 1.68227969348659e-05,
"loss": 11.9068,
"step": 69300
},
{
"epoch": 53.18007662835249,
"grad_norm": 1.2973859310150146,
"learning_rate": 1.6774904214559387e-05,
"loss": 11.8753,
"step": 69400
},
{
"epoch": 53.25670498084291,
"grad_norm": 1.3909817934036255,
"learning_rate": 1.6727011494252877e-05,
"loss": 11.6868,
"step": 69500
},
{
"epoch": 53.333333333333336,
"grad_norm": 1.1226869821548462,
"learning_rate": 1.667911877394636e-05,
"loss": 11.7399,
"step": 69600
},
{
"epoch": 53.40996168582375,
"grad_norm": 1.6086245775222778,
"learning_rate": 1.6631226053639847e-05,
"loss": 11.9871,
"step": 69700
},
{
"epoch": 53.486590038314176,
"grad_norm": 5.143097400665283,
"learning_rate": 1.65838122605364e-05,
"loss": 12.0991,
"step": 69800
},
{
"epoch": 53.5632183908046,
"grad_norm": 1.1883777379989624,
"learning_rate": 1.6535919540229886e-05,
"loss": 11.7275,
"step": 69900
},
{
"epoch": 53.63984674329502,
"grad_norm": 1.152468204498291,
"learning_rate": 1.6488026819923373e-05,
"loss": 11.9268,
"step": 70000
},
{
"epoch": 53.71647509578544,
"grad_norm": 1.6981552839279175,
"learning_rate": 1.6440134099616856e-05,
"loss": 12.0293,
"step": 70100
},
{
"epoch": 53.793103448275865,
"grad_norm": 1.6067506074905396,
"learning_rate": 1.6392241379310347e-05,
"loss": 11.9477,
"step": 70200
},
{
"epoch": 53.86973180076628,
"grad_norm": 3.569709539413452,
"learning_rate": 1.6344348659003834e-05,
"loss": 11.8055,
"step": 70300
},
{
"epoch": 53.946360153256705,
"grad_norm": 2.3322157859802246,
"learning_rate": 1.629645593869732e-05,
"loss": 12.027,
"step": 70400
},
{
"epoch": 54.0,
"eval_loss": 12.753838539123535,
"eval_runtime": 44.1081,
"eval_samples_per_second": 29.586,
"eval_steps_per_second": 3.718,
"step": 70470
},
{
"epoch": 54.02298850574713,
"grad_norm": 1.4370397329330444,
"learning_rate": 1.6248563218390804e-05,
"loss": 12.0639,
"step": 70500
},
{
"epoch": 54.099616858237546,
"grad_norm": 2.486645221710205,
"learning_rate": 1.620067049808429e-05,
"loss": 11.9231,
"step": 70600
},
{
"epoch": 54.17624521072797,
"grad_norm": 2.0936434268951416,
"learning_rate": 1.6152777777777778e-05,
"loss": 11.9161,
"step": 70700
},
{
"epoch": 54.252873563218394,
"grad_norm": 1.5211490392684937,
"learning_rate": 1.6104885057471265e-05,
"loss": 11.9338,
"step": 70800
},
{
"epoch": 54.32950191570881,
"grad_norm": 1.035090684890747,
"learning_rate": 1.605699233716475e-05,
"loss": 11.7872,
"step": 70900
},
{
"epoch": 54.406130268199234,
"grad_norm": 1.617077112197876,
"learning_rate": 1.600909961685824e-05,
"loss": 11.9772,
"step": 71000
},
{
"epoch": 54.48275862068966,
"grad_norm": 1.3988826274871826,
"learning_rate": 1.5961206896551725e-05,
"loss": 12.0088,
"step": 71100
},
{
"epoch": 54.559386973180075,
"grad_norm": 1.7126933336257935,
"learning_rate": 1.5913314176245212e-05,
"loss": 11.9831,
"step": 71200
},
{
"epoch": 54.6360153256705,
"grad_norm": 2.3251850605010986,
"learning_rate": 1.58654214559387e-05,
"loss": 11.7345,
"step": 71300
},
{
"epoch": 54.71264367816092,
"grad_norm": 1.6456447839736938,
"learning_rate": 1.5817528735632183e-05,
"loss": 12.0158,
"step": 71400
},
{
"epoch": 54.78927203065134,
"grad_norm": 2.1808829307556152,
"learning_rate": 1.576963601532567e-05,
"loss": 12.0169,
"step": 71500
},
{
"epoch": 54.86590038314176,
"grad_norm": 2.2233774662017822,
"learning_rate": 1.572174329501916e-05,
"loss": 11.9144,
"step": 71600
},
{
"epoch": 54.94252873563218,
"grad_norm": 1.5419303178787231,
"learning_rate": 1.5673850574712647e-05,
"loss": 11.7915,
"step": 71700
},
{
"epoch": 55.0,
"eval_loss": 12.735248565673828,
"eval_runtime": 44.0526,
"eval_samples_per_second": 29.624,
"eval_steps_per_second": 3.723,
"step": 71775
},
{
"epoch": 55.019157088122604,
"grad_norm": 2.4967896938323975,
"learning_rate": 1.562595785440613e-05,
"loss": 12.1777,
"step": 71800
},
{
"epoch": 55.09578544061303,
"grad_norm": 1.6103179454803467,
"learning_rate": 1.5578065134099617e-05,
"loss": 12.0236,
"step": 71900
},
{
"epoch": 55.172413793103445,
"grad_norm": 1.058643102645874,
"learning_rate": 1.553065134099617e-05,
"loss": 11.9485,
"step": 72000
},
{
"epoch": 55.24904214559387,
"grad_norm": 1.1860133409500122,
"learning_rate": 1.5482758620689656e-05,
"loss": 11.7885,
"step": 72100
},
{
"epoch": 55.32567049808429,
"grad_norm": 2.6516213417053223,
"learning_rate": 1.5434865900383142e-05,
"loss": 11.8373,
"step": 72200
},
{
"epoch": 55.40229885057471,
"grad_norm": 1.3108186721801758,
"learning_rate": 1.538697318007663e-05,
"loss": 11.8938,
"step": 72300
},
{
"epoch": 55.47892720306513,
"grad_norm": 2.721954345703125,
"learning_rate": 1.5339080459770116e-05,
"loss": 11.873,
"step": 72400
},
{
"epoch": 55.55555555555556,
"grad_norm": 1.0352996587753296,
"learning_rate": 1.5291187739463603e-05,
"loss": 12.025,
"step": 72500
},
{
"epoch": 55.632183908045974,
"grad_norm": 1.258169412612915,
"learning_rate": 1.5243295019157088e-05,
"loss": 11.9444,
"step": 72600
},
{
"epoch": 55.7088122605364,
"grad_norm": 2.314866781234741,
"learning_rate": 1.5195402298850575e-05,
"loss": 11.711,
"step": 72700
},
{
"epoch": 55.78544061302682,
"grad_norm": 1.308590292930603,
"learning_rate": 1.5147509578544062e-05,
"loss": 12.0446,
"step": 72800
},
{
"epoch": 55.86206896551724,
"grad_norm": 2.928891897201538,
"learning_rate": 1.5099616858237547e-05,
"loss": 11.9413,
"step": 72900
},
{
"epoch": 55.93869731800766,
"grad_norm": 1.048743724822998,
"learning_rate": 1.5051724137931036e-05,
"loss": 11.791,
"step": 73000
},
{
"epoch": 56.0,
"eval_loss": 12.750344276428223,
"eval_runtime": 44.088,
"eval_samples_per_second": 29.6,
"eval_steps_per_second": 3.72,
"step": 73080
},
{
"epoch": 56.015325670498086,
"grad_norm": 3.6337478160858154,
"learning_rate": 1.5003831417624523e-05,
"loss": 11.9951,
"step": 73100
},
{
"epoch": 56.0919540229885,
"grad_norm": 1.7665445804595947,
"learning_rate": 1.495593869731801e-05,
"loss": 12.1332,
"step": 73200
},
{
"epoch": 56.16858237547893,
"grad_norm": 1.4894465208053589,
"learning_rate": 1.4908045977011495e-05,
"loss": 11.7198,
"step": 73300
},
{
"epoch": 56.24521072796935,
"grad_norm": 1.0169578790664673,
"learning_rate": 1.4860153256704982e-05,
"loss": 12.0523,
"step": 73400
},
{
"epoch": 56.32183908045977,
"grad_norm": 1.2872236967086792,
"learning_rate": 1.4812260536398467e-05,
"loss": 11.8438,
"step": 73500
},
{
"epoch": 56.39846743295019,
"grad_norm": 1.1032931804656982,
"learning_rate": 1.4764367816091954e-05,
"loss": 11.9058,
"step": 73600
},
{
"epoch": 56.475095785440615,
"grad_norm": 1.4371570348739624,
"learning_rate": 1.4716475095785442e-05,
"loss": 11.9199,
"step": 73700
},
{
"epoch": 56.55172413793103,
"grad_norm": 1.9667787551879883,
"learning_rate": 1.4668582375478929e-05,
"loss": 11.899,
"step": 73800
},
{
"epoch": 56.628352490421456,
"grad_norm": 1.2465131282806396,
"learning_rate": 1.4620689655172414e-05,
"loss": 11.9303,
"step": 73900
},
{
"epoch": 56.70498084291188,
"grad_norm": 1.2738486528396606,
"learning_rate": 1.4573275862068966e-05,
"loss": 11.9897,
"step": 74000
},
{
"epoch": 56.7816091954023,
"grad_norm": 1.7295411825180054,
"learning_rate": 1.4525383141762453e-05,
"loss": 11.989,
"step": 74100
},
{
"epoch": 56.85823754789272,
"grad_norm": 3.2072668075561523,
"learning_rate": 1.4477490421455938e-05,
"loss": 11.8107,
"step": 74200
},
{
"epoch": 56.934865900383144,
"grad_norm": 1.3828212022781372,
"learning_rate": 1.4429597701149425e-05,
"loss": 11.7899,
"step": 74300
},
{
"epoch": 57.0,
"eval_loss": 12.746719360351562,
"eval_runtime": 44.0689,
"eval_samples_per_second": 29.613,
"eval_steps_per_second": 3.721,
"step": 74385
},
{
"epoch": 57.01149425287356,
"grad_norm": 1.1235148906707764,
"learning_rate": 1.4381704980842914e-05,
"loss": 11.9095,
"step": 74400
},
{
"epoch": 57.088122605363985,
"grad_norm": 1.3013513088226318,
"learning_rate": 1.43338122605364e-05,
"loss": 11.8367,
"step": 74500
},
{
"epoch": 57.16475095785441,
"grad_norm": 1.46478271484375,
"learning_rate": 1.4285919540229886e-05,
"loss": 11.8926,
"step": 74600
},
{
"epoch": 57.241379310344826,
"grad_norm": 1.7883129119873047,
"learning_rate": 1.4238026819923373e-05,
"loss": 11.7109,
"step": 74700
},
{
"epoch": 57.31800766283525,
"grad_norm": 2.2156434059143066,
"learning_rate": 1.419013409961686e-05,
"loss": 11.9904,
"step": 74800
},
{
"epoch": 57.39463601532567,
"grad_norm": 1.963996410369873,
"learning_rate": 1.4142241379310345e-05,
"loss": 11.8243,
"step": 74900
},
{
"epoch": 57.47126436781609,
"grad_norm": 1.5265462398529053,
"learning_rate": 1.4094348659003831e-05,
"loss": 11.982,
"step": 75000
},
{
"epoch": 57.547892720306514,
"grad_norm": 1.5820256471633911,
"learning_rate": 1.404645593869732e-05,
"loss": 12.0055,
"step": 75100
},
{
"epoch": 57.62452107279694,
"grad_norm": 1.2654030323028564,
"learning_rate": 1.3998563218390807e-05,
"loss": 11.8634,
"step": 75200
},
{
"epoch": 57.701149425287355,
"grad_norm": 2.1730732917785645,
"learning_rate": 1.3950670498084292e-05,
"loss": 12.098,
"step": 75300
},
{
"epoch": 57.77777777777778,
"grad_norm": 1.7732394933700562,
"learning_rate": 1.3902777777777779e-05,
"loss": 11.856,
"step": 75400
},
{
"epoch": 57.8544061302682,
"grad_norm": 1.366039514541626,
"learning_rate": 1.3854885057471264e-05,
"loss": 12.0139,
"step": 75500
},
{
"epoch": 57.93103448275862,
"grad_norm": 2.9070754051208496,
"learning_rate": 1.3806992337164751e-05,
"loss": 11.9716,
"step": 75600
},
{
"epoch": 58.0,
"eval_loss": 12.731040000915527,
"eval_runtime": 44.0877,
"eval_samples_per_second": 29.6,
"eval_steps_per_second": 3.72,
"step": 75690
},
{
"epoch": 58.00766283524904,
"grad_norm": 2.1817991733551025,
"learning_rate": 1.3759099616858236e-05,
"loss": 11.906,
"step": 75700
},
{
"epoch": 58.08429118773947,
"grad_norm": 1.2766177654266357,
"learning_rate": 1.3711206896551726e-05,
"loss": 12.0479,
"step": 75800
},
{
"epoch": 58.160919540229884,
"grad_norm": 2.82973575592041,
"learning_rate": 1.3663314176245212e-05,
"loss": 11.947,
"step": 75900
},
{
"epoch": 58.23754789272031,
"grad_norm": 1.2385036945343018,
"learning_rate": 1.3615421455938699e-05,
"loss": 11.9196,
"step": 76000
},
{
"epoch": 58.31417624521073,
"grad_norm": 1.3823829889297485,
"learning_rate": 1.3567528735632184e-05,
"loss": 11.9057,
"step": 76100
},
{
"epoch": 58.39080459770115,
"grad_norm": 1.472506046295166,
"learning_rate": 1.351963601532567e-05,
"loss": 11.9563,
"step": 76200
},
{
"epoch": 58.46743295019157,
"grad_norm": 1.5811665058135986,
"learning_rate": 1.3472222222222222e-05,
"loss": 11.8257,
"step": 76300
},
{
"epoch": 58.54406130268199,
"grad_norm": 1.5588597059249878,
"learning_rate": 1.3424329501915708e-05,
"loss": 11.8564,
"step": 76400
},
{
"epoch": 58.62068965517241,
"grad_norm": 1.5810322761535645,
"learning_rate": 1.3376436781609198e-05,
"loss": 11.8566,
"step": 76500
},
{
"epoch": 58.69731800766284,
"grad_norm": 1.5648218393325806,
"learning_rate": 1.3328544061302683e-05,
"loss": 11.9988,
"step": 76600
},
{
"epoch": 58.77394636015325,
"grad_norm": 1.8077315092086792,
"learning_rate": 1.328065134099617e-05,
"loss": 11.7739,
"step": 76700
},
{
"epoch": 58.85057471264368,
"grad_norm": 1.1517853736877441,
"learning_rate": 1.3232758620689655e-05,
"loss": 11.9046,
"step": 76800
},
{
"epoch": 58.9272030651341,
"grad_norm": 1.4639145135879517,
"learning_rate": 1.3184865900383142e-05,
"loss": 11.99,
"step": 76900
},
{
"epoch": 59.0,
"eval_loss": 12.737883567810059,
"eval_runtime": 44.0757,
"eval_samples_per_second": 29.608,
"eval_steps_per_second": 3.721,
"step": 76995
},
{
"epoch": 59.00383141762452,
"grad_norm": 0.9936187267303467,
"learning_rate": 1.3136973180076629e-05,
"loss": 11.9348,
"step": 77000
},
{
"epoch": 59.08045977011494,
"grad_norm": 1.227501630783081,
"learning_rate": 1.3089080459770114e-05,
"loss": 11.9054,
"step": 77100
},
{
"epoch": 59.157088122605366,
"grad_norm": 1.1214205026626587,
"learning_rate": 1.3041187739463603e-05,
"loss": 11.7912,
"step": 77200
},
{
"epoch": 59.23371647509578,
"grad_norm": 1.3010284900665283,
"learning_rate": 1.299329501915709e-05,
"loss": 11.8542,
"step": 77300
},
{
"epoch": 59.310344827586206,
"grad_norm": 1.291937232017517,
"learning_rate": 1.2945402298850576e-05,
"loss": 11.8613,
"step": 77400
},
{
"epoch": 59.38697318007663,
"grad_norm": 1.224834680557251,
"learning_rate": 1.2897509578544062e-05,
"loss": 11.905,
"step": 77500
},
{
"epoch": 59.46360153256705,
"grad_norm": 1.308899998664856,
"learning_rate": 1.2849616858237548e-05,
"loss": 11.9067,
"step": 77600
},
{
"epoch": 59.54022988505747,
"grad_norm": 1.4333239793777466,
"learning_rate": 1.2801724137931034e-05,
"loss": 11.8825,
"step": 77700
},
{
"epoch": 59.616858237547895,
"grad_norm": 1.0542117357254028,
"learning_rate": 1.275383141762452e-05,
"loss": 12.1948,
"step": 77800
},
{
"epoch": 59.69348659003831,
"grad_norm": 1.9502829313278198,
"learning_rate": 1.2705938697318009e-05,
"loss": 11.9644,
"step": 77900
},
{
"epoch": 59.770114942528735,
"grad_norm": 1.3281497955322266,
"learning_rate": 1.2658045977011496e-05,
"loss": 11.8953,
"step": 78000
},
{
"epoch": 59.84674329501916,
"grad_norm": 1.2546237707138062,
"learning_rate": 1.2610153256704981e-05,
"loss": 11.8375,
"step": 78100
},
{
"epoch": 59.923371647509576,
"grad_norm": 1.1630369424819946,
"learning_rate": 1.2562260536398468e-05,
"loss": 11.7133,
"step": 78200
},
{
"epoch": 60.0,
"grad_norm": 1.7483701705932617,
"learning_rate": 1.2514367816091955e-05,
"loss": 12.2012,
"step": 78300
},
{
"epoch": 60.0,
"eval_loss": 12.731696128845215,
"eval_runtime": 44.1463,
"eval_samples_per_second": 29.561,
"eval_steps_per_second": 3.715,
"step": 78300
},
{
"epoch": 60.076628352490424,
"grad_norm": 2.260547399520874,
"learning_rate": 1.2466954022988505e-05,
"loss": 11.9756,
"step": 78400
},
{
"epoch": 60.15325670498084,
"grad_norm": 1.387416124343872,
"learning_rate": 1.2419061302681993e-05,
"loss": 11.9715,
"step": 78500
},
{
"epoch": 60.229885057471265,
"grad_norm": 4.537426948547363,
"learning_rate": 1.2371168582375479e-05,
"loss": 11.6355,
"step": 78600
},
{
"epoch": 60.30651340996169,
"grad_norm": 1.930817723274231,
"learning_rate": 1.2323275862068966e-05,
"loss": 11.6992,
"step": 78700
},
{
"epoch": 60.383141762452105,
"grad_norm": 1.7206836938858032,
"learning_rate": 1.2275383141762452e-05,
"loss": 11.8606,
"step": 78800
},
{
"epoch": 60.45977011494253,
"grad_norm": 1.7796626091003418,
"learning_rate": 1.222749042145594e-05,
"loss": 11.8648,
"step": 78900
},
{
"epoch": 60.53639846743295,
"grad_norm": 1.6132935285568237,
"learning_rate": 1.2179597701149426e-05,
"loss": 11.7958,
"step": 79000
},
{
"epoch": 60.61302681992337,
"grad_norm": 1.2063769102096558,
"learning_rate": 1.2131704980842913e-05,
"loss": 11.8877,
"step": 79100
},
{
"epoch": 60.689655172413794,
"grad_norm": 1.6793837547302246,
"learning_rate": 1.20838122605364e-05,
"loss": 11.9401,
"step": 79200
},
{
"epoch": 60.76628352490422,
"grad_norm": 2.0831589698791504,
"learning_rate": 1.2035919540229885e-05,
"loss": 11.832,
"step": 79300
},
{
"epoch": 60.842911877394634,
"grad_norm": 1.4812095165252686,
"learning_rate": 1.1988026819923372e-05,
"loss": 12.0039,
"step": 79400
},
{
"epoch": 60.91954022988506,
"grad_norm": 2.111269474029541,
"learning_rate": 1.1940134099616859e-05,
"loss": 12.0629,
"step": 79500
},
{
"epoch": 60.99616858237548,
"grad_norm": 1.0717095136642456,
"learning_rate": 1.1892241379310346e-05,
"loss": 11.7839,
"step": 79600
},
{
"epoch": 61.0,
"eval_loss": 12.74968433380127,
"eval_runtime": 44.1528,
"eval_samples_per_second": 29.556,
"eval_steps_per_second": 3.714,
"step": 79605
},
{
"epoch": 61.0727969348659,
"grad_norm": 2.625854969024658,
"learning_rate": 1.1844348659003831e-05,
"loss": 11.9218,
"step": 79700
},
{
"epoch": 61.14942528735632,
"grad_norm": 1.9146480560302734,
"learning_rate": 1.179645593869732e-05,
"loss": 11.6761,
"step": 79800
},
{
"epoch": 61.22605363984675,
"grad_norm": 0.9696165919303894,
"learning_rate": 1.1748563218390805e-05,
"loss": 11.9288,
"step": 79900
},
{
"epoch": 61.30268199233716,
"grad_norm": 1.1847577095031738,
"learning_rate": 1.1700670498084292e-05,
"loss": 11.9674,
"step": 80000
},
{
"epoch": 61.37931034482759,
"grad_norm": 1.3804477453231812,
"learning_rate": 1.1652777777777778e-05,
"loss": 11.812,
"step": 80100
},
{
"epoch": 61.45593869731801,
"grad_norm": 1.6096410751342773,
"learning_rate": 1.1604885057471265e-05,
"loss": 11.8585,
"step": 80200
},
{
"epoch": 61.53256704980843,
"grad_norm": 1.8098353147506714,
"learning_rate": 1.1556992337164752e-05,
"loss": 11.8667,
"step": 80300
},
{
"epoch": 61.60919540229885,
"grad_norm": 6.6866068840026855,
"learning_rate": 1.1509099616858237e-05,
"loss": 11.8999,
"step": 80400
},
{
"epoch": 61.68582375478927,
"grad_norm": 2.7860629558563232,
"learning_rate": 1.1461206896551726e-05,
"loss": 11.8976,
"step": 80500
},
{
"epoch": 61.76245210727969,
"grad_norm": 1.7936979532241821,
"learning_rate": 1.1413314176245211e-05,
"loss": 11.913,
"step": 80600
},
{
"epoch": 61.839080459770116,
"grad_norm": 1.7207527160644531,
"learning_rate": 1.1365421455938698e-05,
"loss": 12.0002,
"step": 80700
},
{
"epoch": 61.91570881226053,
"grad_norm": 2.8500571250915527,
"learning_rate": 1.1317528735632183e-05,
"loss": 12.0012,
"step": 80800
},
{
"epoch": 61.99233716475096,
"grad_norm": 2.1529831886291504,
"learning_rate": 1.1269636015325672e-05,
"loss": 11.9888,
"step": 80900
},
{
"epoch": 62.0,
"eval_loss": 12.742037773132324,
"eval_runtime": 44.1517,
"eval_samples_per_second": 29.557,
"eval_steps_per_second": 3.714,
"step": 80910
},
{
"epoch": 62.06896551724138,
"grad_norm": 1.1954108476638794,
"learning_rate": 1.1221743295019157e-05,
"loss": 11.9691,
"step": 81000
},
{
"epoch": 62.1455938697318,
"grad_norm": 1.253891944885254,
"learning_rate": 1.1174329501915709e-05,
"loss": 12.0618,
"step": 81100
},
{
"epoch": 62.22222222222222,
"grad_norm": 1.5132429599761963,
"learning_rate": 1.1126436781609197e-05,
"loss": 11.9311,
"step": 81200
},
{
"epoch": 62.298850574712645,
"grad_norm": 1.215069055557251,
"learning_rate": 1.1078544061302683e-05,
"loss": 11.7015,
"step": 81300
},
{
"epoch": 62.37547892720306,
"grad_norm": 2.0881459712982178,
"learning_rate": 1.103065134099617e-05,
"loss": 12.0909,
"step": 81400
},
{
"epoch": 62.452107279693486,
"grad_norm": 1.079714298248291,
"learning_rate": 1.0982758620689655e-05,
"loss": 11.9608,
"step": 81500
},
{
"epoch": 62.52873563218391,
"grad_norm": 1.3947062492370605,
"learning_rate": 1.0934865900383143e-05,
"loss": 11.8452,
"step": 81600
},
{
"epoch": 62.60536398467433,
"grad_norm": 1.0822895765304565,
"learning_rate": 1.0886973180076628e-05,
"loss": 11.8232,
"step": 81700
},
{
"epoch": 62.68199233716475,
"grad_norm": 1.6000736951828003,
"learning_rate": 1.0839080459770115e-05,
"loss": 11.994,
"step": 81800
},
{
"epoch": 62.758620689655174,
"grad_norm": 1.6020923852920532,
"learning_rate": 1.0791187739463602e-05,
"loss": 11.9019,
"step": 81900
},
{
"epoch": 62.83524904214559,
"grad_norm": 1.4164994955062866,
"learning_rate": 1.0743295019157089e-05,
"loss": 11.8139,
"step": 82000
},
{
"epoch": 62.911877394636015,
"grad_norm": 2.334690570831299,
"learning_rate": 1.0695402298850576e-05,
"loss": 12.0714,
"step": 82100
},
{
"epoch": 62.98850574712644,
"grad_norm": 1.8338385820388794,
"learning_rate": 1.0647509578544061e-05,
"loss": 11.8382,
"step": 82200
},
{
"epoch": 63.0,
"eval_loss": 12.733258247375488,
"eval_runtime": 44.1527,
"eval_samples_per_second": 29.557,
"eval_steps_per_second": 3.714,
"step": 82215
},
{
"epoch": 63.065134099616856,
"grad_norm": 3.91227650642395,
"learning_rate": 1.059961685823755e-05,
"loss": 11.9929,
"step": 82300
},
{
"epoch": 63.14176245210728,
"grad_norm": 1.1621551513671875,
"learning_rate": 1.0551724137931035e-05,
"loss": 11.9456,
"step": 82400
},
{
"epoch": 63.2183908045977,
"grad_norm": 1.4154562950134277,
"learning_rate": 1.0503831417624522e-05,
"loss": 12.0645,
"step": 82500
},
{
"epoch": 63.29501915708812,
"grad_norm": 1.8987462520599365,
"learning_rate": 1.0455938697318009e-05,
"loss": 11.873,
"step": 82600
},
{
"epoch": 63.371647509578544,
"grad_norm": 1.8300188779830933,
"learning_rate": 1.0408045977011495e-05,
"loss": 11.7687,
"step": 82700
},
{
"epoch": 63.44827586206897,
"grad_norm": 1.4220359325408936,
"learning_rate": 1.036015325670498e-05,
"loss": 11.8298,
"step": 82800
},
{
"epoch": 63.524904214559385,
"grad_norm": 1.1422735452651978,
"learning_rate": 1.0312260536398468e-05,
"loss": 11.9857,
"step": 82900
},
{
"epoch": 63.60153256704981,
"grad_norm": 1.6723980903625488,
"learning_rate": 1.0264367816091954e-05,
"loss": 11.6692,
"step": 83000
},
{
"epoch": 63.67816091954023,
"grad_norm": 1.3438162803649902,
"learning_rate": 1.0216954022988506e-05,
"loss": 11.8703,
"step": 83100
},
{
"epoch": 63.75478927203065,
"grad_norm": 1.2540138959884644,
"learning_rate": 1.0169061302681993e-05,
"loss": 11.8198,
"step": 83200
},
{
"epoch": 63.83141762452107,
"grad_norm": 1.439274787902832,
"learning_rate": 1.012116858237548e-05,
"loss": 11.8904,
"step": 83300
},
{
"epoch": 63.9080459770115,
"grad_norm": 1.0765241384506226,
"learning_rate": 1.0073275862068967e-05,
"loss": 11.8521,
"step": 83400
},
{
"epoch": 63.984674329501914,
"grad_norm": 1.066419005393982,
"learning_rate": 1.0025383141762452e-05,
"loss": 11.8361,
"step": 83500
},
{
"epoch": 64.0,
"eval_loss": 12.740053176879883,
"eval_runtime": 44.1473,
"eval_samples_per_second": 29.56,
"eval_steps_per_second": 3.715,
"step": 83520
},
{
"epoch": 64.06130268199233,
"grad_norm": 1.2648850679397583,
"learning_rate": 9.977490421455939e-06,
"loss": 12.1,
"step": 83600
},
{
"epoch": 64.13793103448276,
"grad_norm": 1.115157961845398,
"learning_rate": 9.929597701149426e-06,
"loss": 11.798,
"step": 83700
},
{
"epoch": 64.21455938697318,
"grad_norm": 1.6352553367614746,
"learning_rate": 9.881704980842913e-06,
"loss": 11.9761,
"step": 83800
},
{
"epoch": 64.2911877394636,
"grad_norm": 1.2003965377807617,
"learning_rate": 9.833812260536398e-06,
"loss": 11.9813,
"step": 83900
},
{
"epoch": 64.36781609195403,
"grad_norm": 1.5004589557647705,
"learning_rate": 9.785919540229886e-06,
"loss": 11.7826,
"step": 84000
},
{
"epoch": 64.44444444444444,
"grad_norm": 1.3350985050201416,
"learning_rate": 9.738026819923372e-06,
"loss": 11.8015,
"step": 84100
},
{
"epoch": 64.52107279693486,
"grad_norm": 1.5985853672027588,
"learning_rate": 9.690134099616858e-06,
"loss": 11.6736,
"step": 84200
},
{
"epoch": 64.59770114942529,
"grad_norm": 2.1115546226501465,
"learning_rate": 9.642241379310345e-06,
"loss": 11.7572,
"step": 84300
},
{
"epoch": 64.67432950191571,
"grad_norm": 2.5769665241241455,
"learning_rate": 9.594348659003832e-06,
"loss": 11.8057,
"step": 84400
},
{
"epoch": 64.75095785440612,
"grad_norm": 3.2280073165893555,
"learning_rate": 9.546455938697319e-06,
"loss": 11.9184,
"step": 84500
},
{
"epoch": 64.82758620689656,
"grad_norm": 1.2311729192733765,
"learning_rate": 9.498563218390804e-06,
"loss": 11.9657,
"step": 84600
},
{
"epoch": 64.90421455938697,
"grad_norm": 1.6303430795669556,
"learning_rate": 9.450670498084293e-06,
"loss": 11.9864,
"step": 84700
},
{
"epoch": 64.98084291187739,
"grad_norm": 1.6421687602996826,
"learning_rate": 9.402777777777778e-06,
"loss": 11.8224,
"step": 84800
},
{
"epoch": 65.0,
"eval_loss": 12.752345085144043,
"eval_runtime": 44.1763,
"eval_samples_per_second": 29.541,
"eval_steps_per_second": 3.712,
"step": 84825
},
{
"epoch": 65.05747126436782,
"grad_norm": 1.2040326595306396,
"learning_rate": 9.354885057471265e-06,
"loss": 11.7626,
"step": 84900
},
{
"epoch": 65.13409961685824,
"grad_norm": 1.1865389347076416,
"learning_rate": 9.30699233716475e-06,
"loss": 12.015,
"step": 85000
},
{
"epoch": 65.21072796934865,
"grad_norm": 2.0402724742889404,
"learning_rate": 9.259099616858239e-06,
"loss": 11.8473,
"step": 85100
},
{
"epoch": 65.28735632183908,
"grad_norm": 1.8505759239196777,
"learning_rate": 9.21168582375479e-06,
"loss": 11.9353,
"step": 85200
},
{
"epoch": 65.3639846743295,
"grad_norm": 2.3651750087738037,
"learning_rate": 9.163793103448276e-06,
"loss": 12.0637,
"step": 85300
},
{
"epoch": 65.44061302681992,
"grad_norm": 1.9731732606887817,
"learning_rate": 9.115900383141762e-06,
"loss": 12.0013,
"step": 85400
},
{
"epoch": 65.51724137931035,
"grad_norm": 1.3928194046020508,
"learning_rate": 9.06800766283525e-06,
"loss": 11.6937,
"step": 85500
},
{
"epoch": 65.59386973180077,
"grad_norm": 1.580771565437317,
"learning_rate": 9.020114942528736e-06,
"loss": 11.5997,
"step": 85600
},
{
"epoch": 65.67049808429118,
"grad_norm": 1.143648624420166,
"learning_rate": 8.972222222222221e-06,
"loss": 11.948,
"step": 85700
},
{
"epoch": 65.74712643678161,
"grad_norm": 1.9105567932128906,
"learning_rate": 8.92432950191571e-06,
"loss": 11.9796,
"step": 85800
},
{
"epoch": 65.82375478927203,
"grad_norm": 1.3926714658737183,
"learning_rate": 8.876436781609195e-06,
"loss": 11.7775,
"step": 85900
},
{
"epoch": 65.90038314176245,
"grad_norm": 1.1419901847839355,
"learning_rate": 8.828544061302682e-06,
"loss": 11.7615,
"step": 86000
},
{
"epoch": 65.97701149425288,
"grad_norm": 1.6939061880111694,
"learning_rate": 8.780651340996169e-06,
"loss": 11.8244,
"step": 86100
},
{
"epoch": 66.0,
"eval_loss": 12.737361907958984,
"eval_runtime": 44.1505,
"eval_samples_per_second": 29.558,
"eval_steps_per_second": 3.715,
"step": 86130
},
{
"epoch": 66.0536398467433,
"grad_norm": 1.953165054321289,
"learning_rate": 8.732758620689656e-06,
"loss": 11.9442,
"step": 86200
},
{
"epoch": 66.13026819923371,
"grad_norm": 2.1596179008483887,
"learning_rate": 8.684865900383143e-06,
"loss": 11.764,
"step": 86300
},
{
"epoch": 66.20689655172414,
"grad_norm": 1.4609719514846802,
"learning_rate": 8.636973180076628e-06,
"loss": 12.1997,
"step": 86400
},
{
"epoch": 66.28352490421456,
"grad_norm": 2.0631511211395264,
"learning_rate": 8.589080459770116e-06,
"loss": 11.8684,
"step": 86500
},
{
"epoch": 66.36015325670498,
"grad_norm": 1.4530664682388306,
"learning_rate": 8.541187739463602e-06,
"loss": 11.8307,
"step": 86600
},
{
"epoch": 66.4367816091954,
"grad_norm": 2.148606777191162,
"learning_rate": 8.493295019157089e-06,
"loss": 11.9725,
"step": 86700
},
{
"epoch": 66.51340996168582,
"grad_norm": 1.8974863290786743,
"learning_rate": 8.445402298850575e-06,
"loss": 11.9907,
"step": 86800
},
{
"epoch": 66.59003831417624,
"grad_norm": 2.369657278060913,
"learning_rate": 8.397509578544062e-06,
"loss": 11.9563,
"step": 86900
},
{
"epoch": 66.66666666666667,
"grad_norm": 1.6854480504989624,
"learning_rate": 8.349616858237547e-06,
"loss": 11.9173,
"step": 87000
},
{
"epoch": 66.74329501915709,
"grad_norm": 1.6539610624313354,
"learning_rate": 8.301724137931034e-06,
"loss": 11.9584,
"step": 87100
},
{
"epoch": 66.8199233716475,
"grad_norm": 1.346731424331665,
"learning_rate": 8.253831417624521e-06,
"loss": 11.7909,
"step": 87200
},
{
"epoch": 66.89655172413794,
"grad_norm": 1.6548290252685547,
"learning_rate": 8.206417624521073e-06,
"loss": 11.9346,
"step": 87300
},
{
"epoch": 66.97318007662835,
"grad_norm": 1.1189563274383545,
"learning_rate": 8.15852490421456e-06,
"loss": 11.9832,
"step": 87400
},
{
"epoch": 67.0,
"eval_loss": 12.747148513793945,
"eval_runtime": 44.147,
"eval_samples_per_second": 29.56,
"eval_steps_per_second": 3.715,
"step": 87435
},
{
"epoch": 67.04980842911877,
"grad_norm": 1.7302024364471436,
"learning_rate": 8.110632183908045e-06,
"loss": 11.8374,
"step": 87500
},
{
"epoch": 67.1264367816092,
"grad_norm": 0.8793215751647949,
"learning_rate": 8.062739463601534e-06,
"loss": 11.7415,
"step": 87600
},
{
"epoch": 67.20306513409962,
"grad_norm": 1.1903204917907715,
"learning_rate": 8.014846743295019e-06,
"loss": 11.8223,
"step": 87700
},
{
"epoch": 67.27969348659003,
"grad_norm": 2.025223731994629,
"learning_rate": 7.966954022988506e-06,
"loss": 11.7065,
"step": 87800
},
{
"epoch": 67.35632183908046,
"grad_norm": 1.2028359174728394,
"learning_rate": 7.919061302681993e-06,
"loss": 11.9446,
"step": 87900
},
{
"epoch": 67.43295019157088,
"grad_norm": 1.56088387966156,
"learning_rate": 7.87116858237548e-06,
"loss": 12.0176,
"step": 88000
},
{
"epoch": 67.5095785440613,
"grad_norm": 1.4466462135314941,
"learning_rate": 7.823275862068966e-06,
"loss": 11.8777,
"step": 88100
},
{
"epoch": 67.58620689655173,
"grad_norm": 2.2348804473876953,
"learning_rate": 7.775383141762453e-06,
"loss": 11.8506,
"step": 88200
},
{
"epoch": 67.66283524904215,
"grad_norm": 1.0889838933944702,
"learning_rate": 7.72749042145594e-06,
"loss": 11.9706,
"step": 88300
},
{
"epoch": 67.73946360153256,
"grad_norm": 1.6289935111999512,
"learning_rate": 7.679597701149425e-06,
"loss": 11.9588,
"step": 88400
},
{
"epoch": 67.816091954023,
"grad_norm": 1.2480045557022095,
"learning_rate": 7.631704980842912e-06,
"loss": 11.7933,
"step": 88500
},
{
"epoch": 67.89272030651341,
"grad_norm": 1.5679010152816772,
"learning_rate": 7.583812260536399e-06,
"loss": 12.0495,
"step": 88600
},
{
"epoch": 67.96934865900383,
"grad_norm": 1.2820953130722046,
"learning_rate": 7.535919540229885e-06,
"loss": 11.8478,
"step": 88700
},
{
"epoch": 68.0,
"eval_loss": 12.716951370239258,
"eval_runtime": 44.1526,
"eval_samples_per_second": 29.557,
"eval_steps_per_second": 3.714,
"step": 88740
},
{
"epoch": 68.04597701149426,
"grad_norm": 1.0503605604171753,
"learning_rate": 7.488026819923372e-06,
"loss": 11.9092,
"step": 88800
},
{
"epoch": 68.12260536398468,
"grad_norm": 1.5500402450561523,
"learning_rate": 7.440134099616859e-06,
"loss": 11.933,
"step": 88900
},
{
"epoch": 68.19923371647509,
"grad_norm": 2.4164953231811523,
"learning_rate": 7.392241379310346e-06,
"loss": 11.8528,
"step": 89000
},
{
"epoch": 68.27586206896552,
"grad_norm": 1.7877123355865479,
"learning_rate": 7.344348659003832e-06,
"loss": 11.8459,
"step": 89100
},
{
"epoch": 68.35249042145594,
"grad_norm": 1.6601005792617798,
"learning_rate": 7.296455938697318e-06,
"loss": 11.8986,
"step": 89200
},
{
"epoch": 68.42911877394636,
"grad_norm": 1.6431148052215576,
"learning_rate": 7.24904214559387e-06,
"loss": 11.8467,
"step": 89300
},
{
"epoch": 68.50574712643679,
"grad_norm": 1.2147421836853027,
"learning_rate": 7.201149425287357e-06,
"loss": 11.9989,
"step": 89400
},
{
"epoch": 68.5823754789272,
"grad_norm": 1.0646436214447021,
"learning_rate": 7.153256704980843e-06,
"loss": 11.6439,
"step": 89500
},
{
"epoch": 68.65900383141762,
"grad_norm": 1.494936466217041,
"learning_rate": 7.105363984674329e-06,
"loss": 11.8232,
"step": 89600
},
{
"epoch": 68.73563218390805,
"grad_norm": 1.1928653717041016,
"learning_rate": 7.057471264367817e-06,
"loss": 12.032,
"step": 89700
},
{
"epoch": 68.81226053639847,
"grad_norm": 1.2193999290466309,
"learning_rate": 7.009578544061303e-06,
"loss": 11.8999,
"step": 89800
},
{
"epoch": 68.88888888888889,
"grad_norm": 1.418272852897644,
"learning_rate": 6.961685823754789e-06,
"loss": 12.0139,
"step": 89900
},
{
"epoch": 68.96551724137932,
"grad_norm": 2.331040620803833,
"learning_rate": 6.913793103448277e-06,
"loss": 12.0201,
"step": 90000
},
{
"epoch": 69.0,
"eval_loss": 12.731438636779785,
"eval_runtime": 44.1419,
"eval_samples_per_second": 29.564,
"eval_steps_per_second": 3.715,
"step": 90045
},
{
"epoch": 69.04214559386973,
"grad_norm": 1.2469091415405273,
"learning_rate": 6.865900383141763e-06,
"loss": 11.7182,
"step": 90100
},
{
"epoch": 69.11877394636015,
"grad_norm": 1.299902319908142,
"learning_rate": 6.818007662835249e-06,
"loss": 11.908,
"step": 90200
},
{
"epoch": 69.19540229885058,
"grad_norm": 2.0446414947509766,
"learning_rate": 6.770114942528737e-06,
"loss": 11.8736,
"step": 90300
},
{
"epoch": 69.272030651341,
"grad_norm": 2.1058554649353027,
"learning_rate": 6.722222222222223e-06,
"loss": 11.7726,
"step": 90400
},
{
"epoch": 69.34865900383141,
"grad_norm": 1.222571849822998,
"learning_rate": 6.674329501915709e-06,
"loss": 12.1008,
"step": 90500
},
{
"epoch": 69.42528735632185,
"grad_norm": 1.2086107730865479,
"learning_rate": 6.6264367816091955e-06,
"loss": 11.9332,
"step": 90600
},
{
"epoch": 69.50191570881226,
"grad_norm": 1.188658356666565,
"learning_rate": 6.578544061302682e-06,
"loss": 11.9603,
"step": 90700
},
{
"epoch": 69.57854406130268,
"grad_norm": 1.1233985424041748,
"learning_rate": 6.530651340996169e-06,
"loss": 11.7879,
"step": 90800
},
{
"epoch": 69.65517241379311,
"grad_norm": 1.8599299192428589,
"learning_rate": 6.482758620689655e-06,
"loss": 12.0864,
"step": 90900
},
{
"epoch": 69.73180076628353,
"grad_norm": 1.213908076286316,
"learning_rate": 6.434865900383143e-06,
"loss": 11.7091,
"step": 91000
},
{
"epoch": 69.80842911877394,
"grad_norm": 1.2682372331619263,
"learning_rate": 6.386973180076629e-06,
"loss": 11.8762,
"step": 91100
},
{
"epoch": 69.88505747126437,
"grad_norm": 1.940184473991394,
"learning_rate": 6.339080459770115e-06,
"loss": 11.6487,
"step": 91200
},
{
"epoch": 69.96168582375479,
"grad_norm": 1.4338123798370361,
"learning_rate": 6.291187739463601e-06,
"loss": 12.152,
"step": 91300
},
{
"epoch": 70.0,
"eval_loss": 12.735883712768555,
"eval_runtime": 44.179,
"eval_samples_per_second": 29.539,
"eval_steps_per_second": 3.712,
"step": 91350
},
{
"epoch": 70.03831417624521,
"grad_norm": 2.018376111984253,
"learning_rate": 6.243295019157088e-06,
"loss": 11.9978,
"step": 91400
},
{
"epoch": 70.11494252873563,
"grad_norm": 1.4965932369232178,
"learning_rate": 6.195881226053641e-06,
"loss": 11.9588,
"step": 91500
},
{
"epoch": 70.19157088122606,
"grad_norm": 1.5459176301956177,
"learning_rate": 6.147988505747127e-06,
"loss": 11.7513,
"step": 91600
},
{
"epoch": 70.26819923371647,
"grad_norm": 1.6559784412384033,
"learning_rate": 6.1000957854406135e-06,
"loss": 11.8124,
"step": 91700
},
{
"epoch": 70.34482758620689,
"grad_norm": 2.100288152694702,
"learning_rate": 6.0522030651341e-06,
"loss": 11.8001,
"step": 91800
},
{
"epoch": 70.42145593869732,
"grad_norm": 2.0167760848999023,
"learning_rate": 6.0043103448275864e-06,
"loss": 11.7079,
"step": 91900
},
{
"epoch": 70.49808429118774,
"grad_norm": 1.2484099864959717,
"learning_rate": 5.956417624521073e-06,
"loss": 11.8747,
"step": 92000
},
{
"epoch": 70.57471264367815,
"grad_norm": 1.4585705995559692,
"learning_rate": 5.908524904214559e-06,
"loss": 11.6371,
"step": 92100
},
{
"epoch": 70.65134099616859,
"grad_norm": 1.2680083513259888,
"learning_rate": 5.860632183908046e-06,
"loss": 11.8783,
"step": 92200
},
{
"epoch": 70.727969348659,
"grad_norm": 3.2429590225219727,
"learning_rate": 5.812739463601532e-06,
"loss": 12.0867,
"step": 92300
},
{
"epoch": 70.80459770114942,
"grad_norm": 1.6496800184249878,
"learning_rate": 5.764846743295019e-06,
"loss": 11.8665,
"step": 92400
},
{
"epoch": 70.88122605363985,
"grad_norm": 1.7092400789260864,
"learning_rate": 5.716954022988506e-06,
"loss": 11.8957,
"step": 92500
},
{
"epoch": 70.95785440613027,
"grad_norm": 1.308349370956421,
"learning_rate": 5.669061302681993e-06,
"loss": 11.6562,
"step": 92600
},
{
"epoch": 71.0,
"eval_loss": 12.738100051879883,
"eval_runtime": 44.1855,
"eval_samples_per_second": 29.535,
"eval_steps_per_second": 3.712,
"step": 92655
},
{
"epoch": 71.03448275862068,
"grad_norm": 1.4456454515457153,
"learning_rate": 5.62116858237548e-06,
"loss": 11.9577,
"step": 92700
},
{
"epoch": 71.11111111111111,
"grad_norm": 1.178861141204834,
"learning_rate": 5.573275862068966e-06,
"loss": 11.7769,
"step": 92800
},
{
"epoch": 71.18773946360153,
"grad_norm": 1.2721989154815674,
"learning_rate": 5.525383141762453e-06,
"loss": 12.0604,
"step": 92900
},
{
"epoch": 71.26436781609195,
"grad_norm": 1.4360485076904297,
"learning_rate": 5.4774904214559396e-06,
"loss": 11.853,
"step": 93000
},
{
"epoch": 71.34099616858238,
"grad_norm": 1.1324783563613892,
"learning_rate": 5.429597701149426e-06,
"loss": 12.0389,
"step": 93100
},
{
"epoch": 71.4176245210728,
"grad_norm": 1.327430009841919,
"learning_rate": 5.3817049808429125e-06,
"loss": 12.1736,
"step": 93200
},
{
"epoch": 71.49425287356321,
"grad_norm": 1.7536532878875732,
"learning_rate": 5.3338122605363985e-06,
"loss": 11.8394,
"step": 93300
},
{
"epoch": 71.57088122605364,
"grad_norm": 1.2314512729644775,
"learning_rate": 5.285919540229885e-06,
"loss": 11.8958,
"step": 93400
},
{
"epoch": 71.64750957854406,
"grad_norm": 1.3814700841903687,
"learning_rate": 5.2380268199233714e-06,
"loss": 11.8036,
"step": 93500
},
{
"epoch": 71.72413793103448,
"grad_norm": 1.6986061334609985,
"learning_rate": 5.190134099616858e-06,
"loss": 11.7598,
"step": 93600
},
{
"epoch": 71.80076628352491,
"grad_norm": 1.1988410949707031,
"learning_rate": 5.142241379310345e-06,
"loss": 11.7643,
"step": 93700
},
{
"epoch": 71.87739463601532,
"grad_norm": 1.005979061126709,
"learning_rate": 5.094827586206897e-06,
"loss": 11.8694,
"step": 93800
},
{
"epoch": 71.95402298850574,
"grad_norm": 1.8171489238739014,
"learning_rate": 5.046934865900384e-06,
"loss": 11.7541,
"step": 93900
},
{
"epoch": 72.0,
"eval_loss": 12.730957984924316,
"eval_runtime": 44.1811,
"eval_samples_per_second": 29.538,
"eval_steps_per_second": 3.712,
"step": 93960
},
{
"epoch": 72.03065134099617,
"grad_norm": 1.2113227844238281,
"learning_rate": 4.99904214559387e-06,
"loss": 11.8434,
"step": 94000
},
{
"epoch": 72.10727969348659,
"grad_norm": 1.9516360759735107,
"learning_rate": 4.951149425287357e-06,
"loss": 12.0732,
"step": 94100
},
{
"epoch": 72.183908045977,
"grad_norm": 1.6725817918777466,
"learning_rate": 4.903256704980843e-06,
"loss": 11.9187,
"step": 94200
},
{
"epoch": 72.26053639846744,
"grad_norm": 1.5325151681900024,
"learning_rate": 4.85536398467433e-06,
"loss": 11.8286,
"step": 94300
},
{
"epoch": 72.33716475095785,
"grad_norm": 1.4346359968185425,
"learning_rate": 4.807471264367816e-06,
"loss": 11.9449,
"step": 94400
},
{
"epoch": 72.41379310344827,
"grad_norm": 1.8294119834899902,
"learning_rate": 4.7595785440613025e-06,
"loss": 11.7885,
"step": 94500
},
{
"epoch": 72.4904214559387,
"grad_norm": 3.0054831504821777,
"learning_rate": 4.7116858237547894e-06,
"loss": 11.9011,
"step": 94600
},
{
"epoch": 72.56704980842912,
"grad_norm": 3.023944616317749,
"learning_rate": 4.663793103448276e-06,
"loss": 11.7951,
"step": 94700
},
{
"epoch": 72.64367816091954,
"grad_norm": 1.6727356910705566,
"learning_rate": 4.615900383141763e-06,
"loss": 11.6363,
"step": 94800
},
{
"epoch": 72.72030651340997,
"grad_norm": 2.4141032695770264,
"learning_rate": 4.568007662835249e-06,
"loss": 11.8062,
"step": 94900
},
{
"epoch": 72.79693486590038,
"grad_norm": 1.810632348060608,
"learning_rate": 4.520114942528736e-06,
"loss": 11.7885,
"step": 95000
},
{
"epoch": 72.8735632183908,
"grad_norm": 1.2663646936416626,
"learning_rate": 4.472222222222222e-06,
"loss": 11.8532,
"step": 95100
},
{
"epoch": 72.95019157088123,
"grad_norm": 1.1440293788909912,
"learning_rate": 4.424329501915709e-06,
"loss": 11.9398,
"step": 95200
},
{
"epoch": 73.0,
"eval_loss": 12.724422454833984,
"eval_runtime": 44.1981,
"eval_samples_per_second": 29.526,
"eval_steps_per_second": 3.711,
"step": 95265
},
{
"epoch": 73.02681992337165,
"grad_norm": 1.0655268430709839,
"learning_rate": 4.376436781609196e-06,
"loss": 11.9855,
"step": 95300
},
{
"epoch": 73.10344827586206,
"grad_norm": 1.2701817750930786,
"learning_rate": 4.328544061302682e-06,
"loss": 11.7504,
"step": 95400
},
{
"epoch": 73.1800766283525,
"grad_norm": 1.4740400314331055,
"learning_rate": 4.280651340996169e-06,
"loss": 11.8391,
"step": 95500
},
{
"epoch": 73.25670498084291,
"grad_norm": 2.1387853622436523,
"learning_rate": 4.232758620689655e-06,
"loss": 11.8052,
"step": 95600
},
{
"epoch": 73.33333333333333,
"grad_norm": 1.295242190361023,
"learning_rate": 4.184865900383142e-06,
"loss": 11.9859,
"step": 95700
},
{
"epoch": 73.40996168582376,
"grad_norm": 1.4711384773254395,
"learning_rate": 4.136973180076629e-06,
"loss": 12.1523,
"step": 95800
},
{
"epoch": 73.48659003831418,
"grad_norm": 1.7779674530029297,
"learning_rate": 4.089080459770115e-06,
"loss": 11.698,
"step": 95900
},
{
"epoch": 73.5632183908046,
"grad_norm": 2.6070003509521484,
"learning_rate": 4.0411877394636015e-06,
"loss": 11.9877,
"step": 96000
},
{
"epoch": 73.63984674329502,
"grad_norm": 1.4775136709213257,
"learning_rate": 3.993295019157088e-06,
"loss": 11.7928,
"step": 96100
},
{
"epoch": 73.71647509578544,
"grad_norm": 1.7105778455734253,
"learning_rate": 3.945402298850575e-06,
"loss": 12.0444,
"step": 96200
},
{
"epoch": 73.79310344827586,
"grad_norm": 1.6719238758087158,
"learning_rate": 3.897988505747126e-06,
"loss": 11.9407,
"step": 96300
},
{
"epoch": 73.86973180076629,
"grad_norm": 1.312474250793457,
"learning_rate": 3.850095785440613e-06,
"loss": 11.7468,
"step": 96400
},
{
"epoch": 73.9463601532567,
"grad_norm": 0.9431168437004089,
"learning_rate": 3.8022030651340995e-06,
"loss": 11.8737,
"step": 96500
},
{
"epoch": 74.0,
"eval_loss": 12.720576286315918,
"eval_runtime": 44.1482,
"eval_samples_per_second": 29.56,
"eval_steps_per_second": 3.715,
"step": 96570
},
{
"epoch": 74.02298850574712,
"grad_norm": 1.6064398288726807,
"learning_rate": 3.7543103448275864e-06,
"loss": 11.8828,
"step": 96600
},
{
"epoch": 74.09961685823755,
"grad_norm": 2.088803768157959,
"learning_rate": 3.7064176245210733e-06,
"loss": 11.7576,
"step": 96700
},
{
"epoch": 74.17624521072797,
"grad_norm": 1.5417454242706299,
"learning_rate": 3.6585249042145593e-06,
"loss": 11.9239,
"step": 96800
},
{
"epoch": 74.25287356321839,
"grad_norm": 1.5983319282531738,
"learning_rate": 3.610632183908046e-06,
"loss": 11.8119,
"step": 96900
},
{
"epoch": 74.32950191570882,
"grad_norm": 3.7642099857330322,
"learning_rate": 3.5627394636015326e-06,
"loss": 11.8259,
"step": 97000
},
{
"epoch": 74.40613026819923,
"grad_norm": 1.5149072408676147,
"learning_rate": 3.5148467432950195e-06,
"loss": 11.9898,
"step": 97100
},
{
"epoch": 74.48275862068965,
"grad_norm": 0.9915036559104919,
"learning_rate": 3.4669540229885055e-06,
"loss": 11.7665,
"step": 97200
},
{
"epoch": 74.55938697318008,
"grad_norm": 1.2745176553726196,
"learning_rate": 3.4190613026819924e-06,
"loss": 11.9657,
"step": 97300
},
{
"epoch": 74.6360153256705,
"grad_norm": 2.390751600265503,
"learning_rate": 3.3711685823754793e-06,
"loss": 11.6856,
"step": 97400
},
{
"epoch": 74.71264367816092,
"grad_norm": 2.2279295921325684,
"learning_rate": 3.3232758620689653e-06,
"loss": 11.7551,
"step": 97500
},
{
"epoch": 74.78927203065135,
"grad_norm": 1.8389006853103638,
"learning_rate": 3.275383141762452e-06,
"loss": 12.0037,
"step": 97600
},
{
"epoch": 74.86590038314176,
"grad_norm": 1.4288936853408813,
"learning_rate": 3.2274904214559387e-06,
"loss": 12.0561,
"step": 97700
},
{
"epoch": 74.94252873563218,
"grad_norm": 1.037800669670105,
"learning_rate": 3.1795977011494255e-06,
"loss": 11.9257,
"step": 97800
},
{
"epoch": 75.0,
"eval_loss": 12.724896430969238,
"eval_runtime": 44.1538,
"eval_samples_per_second": 29.556,
"eval_steps_per_second": 3.714,
"step": 97875
},
{
"epoch": 75.01915708812261,
"grad_norm": 0.9783554673194885,
"learning_rate": 3.1317049808429124e-06,
"loss": 11.7455,
"step": 97900
},
{
"epoch": 75.09578544061303,
"grad_norm": 1.4434301853179932,
"learning_rate": 3.0838122605363985e-06,
"loss": 11.99,
"step": 98000
},
{
"epoch": 75.17241379310344,
"grad_norm": 1.2560200691223145,
"learning_rate": 3.035919540229885e-06,
"loss": 11.8445,
"step": 98100
},
{
"epoch": 75.24904214559388,
"grad_norm": 1.123687982559204,
"learning_rate": 2.988026819923372e-06,
"loss": 11.8894,
"step": 98200
},
{
"epoch": 75.32567049808429,
"grad_norm": 1.2393250465393066,
"learning_rate": 2.9401340996168583e-06,
"loss": 11.7591,
"step": 98300
},
{
"epoch": 75.40229885057471,
"grad_norm": 2.023070812225342,
"learning_rate": 2.892241379310345e-06,
"loss": 11.7083,
"step": 98400
},
{
"epoch": 75.47892720306514,
"grad_norm": 1.7746585607528687,
"learning_rate": 2.8443486590038316e-06,
"loss": 12.0237,
"step": 98500
},
{
"epoch": 75.55555555555556,
"grad_norm": 1.6215800046920776,
"learning_rate": 2.796455938697318e-06,
"loss": 11.8271,
"step": 98600
},
{
"epoch": 75.63218390804597,
"grad_norm": 2.3727614879608154,
"learning_rate": 2.7490421455938698e-06,
"loss": 11.9133,
"step": 98700
},
{
"epoch": 75.7088122605364,
"grad_norm": 1.562569260597229,
"learning_rate": 2.7011494252873562e-06,
"loss": 11.8886,
"step": 98800
},
{
"epoch": 75.78544061302682,
"grad_norm": 0.8996521830558777,
"learning_rate": 2.653256704980843e-06,
"loss": 11.6606,
"step": 98900
},
{
"epoch": 75.86206896551724,
"grad_norm": 1.6331411600112915,
"learning_rate": 2.6053639846743296e-06,
"loss": 12.057,
"step": 99000
},
{
"epoch": 75.93869731800767,
"grad_norm": 1.2690104246139526,
"learning_rate": 2.5574712643678165e-06,
"loss": 11.9791,
"step": 99100
},
{
"epoch": 76.0,
"eval_loss": 12.717323303222656,
"eval_runtime": 44.1546,
"eval_samples_per_second": 29.555,
"eval_steps_per_second": 3.714,
"step": 99180
},
{
"epoch": 76.01532567049809,
"grad_norm": 1.737823724746704,
"learning_rate": 2.509578544061303e-06,
"loss": 11.8981,
"step": 99200
},
{
"epoch": 76.0919540229885,
"grad_norm": 1.0878353118896484,
"learning_rate": 2.4616858237547894e-06,
"loss": 11.8443,
"step": 99300
},
{
"epoch": 76.16858237547893,
"grad_norm": 2.0454564094543457,
"learning_rate": 2.413793103448276e-06,
"loss": 11.8515,
"step": 99400
},
{
"epoch": 76.24521072796935,
"grad_norm": 1.3210684061050415,
"learning_rate": 2.3659003831417623e-06,
"loss": 12.0233,
"step": 99500
},
{
"epoch": 76.32183908045977,
"grad_norm": 1.1547104120254517,
"learning_rate": 2.318007662835249e-06,
"loss": 11.7145,
"step": 99600
},
{
"epoch": 76.3984674329502,
"grad_norm": 1.3948626518249512,
"learning_rate": 2.270114942528736e-06,
"loss": 11.7098,
"step": 99700
},
{
"epoch": 76.47509578544062,
"grad_norm": 1.2874501943588257,
"learning_rate": 2.2222222222222225e-06,
"loss": 11.8953,
"step": 99800
},
{
"epoch": 76.55172413793103,
"grad_norm": 1.8570905923843384,
"learning_rate": 2.174329501915709e-06,
"loss": 11.9397,
"step": 99900
},
{
"epoch": 76.62835249042146,
"grad_norm": 1.3673057556152344,
"learning_rate": 2.1264367816091954e-06,
"loss": 11.8056,
"step": 100000
},
{
"epoch": 76.70498084291188,
"grad_norm": 2.1938419342041016,
"learning_rate": 2.078544061302682e-06,
"loss": 11.9414,
"step": 100100
},
{
"epoch": 76.7816091954023,
"grad_norm": 1.9171061515808105,
"learning_rate": 2.0306513409961687e-06,
"loss": 11.8369,
"step": 100200
},
{
"epoch": 76.85823754789271,
"grad_norm": 1.0486401319503784,
"learning_rate": 1.982758620689655e-06,
"loss": 11.8322,
"step": 100300
},
{
"epoch": 76.93486590038314,
"grad_norm": 1.6005215644836426,
"learning_rate": 1.934865900383142e-06,
"loss": 11.8781,
"step": 100400
},
{
"epoch": 77.0,
"eval_loss": 12.72097396850586,
"eval_runtime": 44.1751,
"eval_samples_per_second": 29.542,
"eval_steps_per_second": 3.712,
"step": 100485
}
],
"logging_steps": 100,
"max_steps": 104400,
"num_input_tokens_seen": 0,
"num_train_epochs": 80,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 9
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.681650983960218e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}