{ "best_metric": 12.716951370239258, "best_model_checkpoint": "/kaggle/working/output/checkpoint-88740", "epoch": 77.0, "eval_steps": 500, "global_step": 100485, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07662835249042145, "grad_norm": 8.174947738647461, "learning_rate": 4.9952586206896554e-05, "loss": 96.5258, "step": 100 }, { "epoch": 0.1532567049808429, "grad_norm": 8.584559440612793, "learning_rate": 4.990469348659004e-05, "loss": 48.0822, "step": 200 }, { "epoch": 0.22988505747126436, "grad_norm": 8.02587604522705, "learning_rate": 4.985680076628353e-05, "loss": 31.9469, "step": 300 }, { "epoch": 0.3065134099616858, "grad_norm": 6.968703746795654, "learning_rate": 4.9808908045977015e-05, "loss": 24.973, "step": 400 }, { "epoch": 0.3831417624521073, "grad_norm": 6.017839431762695, "learning_rate": 4.97610153256705e-05, "loss": 20.7473, "step": 500 }, { "epoch": 0.45977011494252873, "grad_norm": 4.75618839263916, "learning_rate": 4.971312260536399e-05, "loss": 18.6219, "step": 600 }, { "epoch": 0.5363984674329502, "grad_norm": 3.5624868869781494, "learning_rate": 4.9665229885057475e-05, "loss": 17.1775, "step": 700 }, { "epoch": 0.6130268199233716, "grad_norm": 2.889848470687866, "learning_rate": 4.961733716475096e-05, "loss": 16.1131, "step": 800 }, { "epoch": 0.6896551724137931, "grad_norm": 8.15518856048584, "learning_rate": 4.956944444444445e-05, "loss": 15.8697, "step": 900 }, { "epoch": 0.7662835249042146, "grad_norm": 3.092848539352417, "learning_rate": 4.952155172413793e-05, "loss": 15.5523, "step": 1000 }, { "epoch": 0.842911877394636, "grad_norm": 2.181015968322754, "learning_rate": 4.9473659003831416e-05, "loss": 15.5628, "step": 1100 }, { "epoch": 0.9195402298850575, "grad_norm": 2.1515514850616455, "learning_rate": 4.94257662835249e-05, "loss": 15.3004, "step": 1200 }, { "epoch": 0.9961685823754789, "grad_norm": 1.476803183555603, "learning_rate": 4.937787356321839e-05, "loss": 15.3448, "step": 1300 }, { "epoch": 1.0, "eval_loss": 15.641121864318848, "eval_runtime": 44.0061, "eval_samples_per_second": 29.655, "eval_steps_per_second": 3.727, "step": 1305 }, { "epoch": 1.0727969348659003, "grad_norm": 3.050917863845825, "learning_rate": 4.932998084291188e-05, "loss": 14.901, "step": 1400 }, { "epoch": 1.1494252873563218, "grad_norm": 1.6784011125564575, "learning_rate": 4.928208812260537e-05, "loss": 14.7073, "step": 1500 }, { "epoch": 1.2260536398467432, "grad_norm": 3.2630977630615234, "learning_rate": 4.923419540229886e-05, "loss": 14.9142, "step": 1600 }, { "epoch": 1.3026819923371646, "grad_norm": 1.6106696128845215, "learning_rate": 4.9186302681992344e-05, "loss": 14.9731, "step": 1700 }, { "epoch": 1.3793103448275863, "grad_norm": 4.378266334533691, "learning_rate": 4.9138409961685824e-05, "loss": 14.5922, "step": 1800 }, { "epoch": 1.4559386973180077, "grad_norm": 2.196368455886841, "learning_rate": 4.909051724137931e-05, "loss": 15.024, "step": 1900 }, { "epoch": 1.5325670498084292, "grad_norm": 1.1820286512374878, "learning_rate": 4.90426245210728e-05, "loss": 14.6291, "step": 2000 }, { "epoch": 1.6091954022988506, "grad_norm": 2.6733219623565674, "learning_rate": 4.8994731800766285e-05, "loss": 15.1916, "step": 2100 }, { "epoch": 1.685823754789272, "grad_norm": 2.461630344390869, "learning_rate": 4.894683908045977e-05, "loss": 14.7438, "step": 2200 }, { "epoch": 1.7624521072796935, "grad_norm": 1.7039703130722046, "learning_rate": 4.889894636015326e-05, "loss": 14.3014, "step": 2300 }, { "epoch": 1.839080459770115, "grad_norm": 2.291198253631592, "learning_rate": 4.8851053639846746e-05, "loss": 14.5648, "step": 2400 }, { "epoch": 1.9157088122605364, "grad_norm": 2.088695764541626, "learning_rate": 4.880316091954023e-05, "loss": 14.2778, "step": 2500 }, { "epoch": 1.9923371647509578, "grad_norm": 1.9745572805404663, "learning_rate": 4.875526819923372e-05, "loss": 14.612, "step": 2600 }, { "epoch": 2.0, "eval_loss": 14.947260856628418, "eval_runtime": 44.059, "eval_samples_per_second": 29.619, "eval_steps_per_second": 3.722, "step": 2610 }, { "epoch": 2.0689655172413794, "grad_norm": 3.296757698059082, "learning_rate": 4.8707375478927206e-05, "loss": 14.4268, "step": 2700 }, { "epoch": 2.1455938697318007, "grad_norm": 1.2265104055404663, "learning_rate": 4.865948275862069e-05, "loss": 14.3716, "step": 2800 }, { "epoch": 2.2222222222222223, "grad_norm": 2.3575916290283203, "learning_rate": 4.861159003831418e-05, "loss": 14.2911, "step": 2900 }, { "epoch": 2.2988505747126435, "grad_norm": 1.535346508026123, "learning_rate": 4.856369731800767e-05, "loss": 14.0469, "step": 3000 }, { "epoch": 2.375478927203065, "grad_norm": 2.3857269287109375, "learning_rate": 4.8515804597701154e-05, "loss": 14.0246, "step": 3100 }, { "epoch": 2.4521072796934864, "grad_norm": 1.46570885181427, "learning_rate": 4.846791187739464e-05, "loss": 14.0864, "step": 3200 }, { "epoch": 2.528735632183908, "grad_norm": 1.3398170471191406, "learning_rate": 4.842001915708813e-05, "loss": 14.1075, "step": 3300 }, { "epoch": 2.6053639846743293, "grad_norm": 1.4247232675552368, "learning_rate": 4.8372126436781614e-05, "loss": 13.9681, "step": 3400 }, { "epoch": 2.681992337164751, "grad_norm": 1.602295160293579, "learning_rate": 4.83242337164751e-05, "loss": 14.0847, "step": 3500 }, { "epoch": 2.7586206896551726, "grad_norm": 1.8135626316070557, "learning_rate": 4.827634099616858e-05, "loss": 13.9871, "step": 3600 }, { "epoch": 2.835249042145594, "grad_norm": 2.3612937927246094, "learning_rate": 4.822844827586207e-05, "loss": 14.043, "step": 3700 }, { "epoch": 2.9118773946360155, "grad_norm": 2.1295549869537354, "learning_rate": 4.8180555555555555e-05, "loss": 14.0695, "step": 3800 }, { "epoch": 2.9885057471264367, "grad_norm": 2.768362283706665, "learning_rate": 4.813266283524904e-05, "loss": 13.8804, "step": 3900 }, { "epoch": 3.0, "eval_loss": 14.543105125427246, "eval_runtime": 44.0531, "eval_samples_per_second": 29.623, "eval_steps_per_second": 3.723, "step": 3915 }, { "epoch": 3.0651340996168583, "grad_norm": 2.190544366836548, "learning_rate": 4.808477011494253e-05, "loss": 13.8831, "step": 4000 }, { "epoch": 3.1417624521072796, "grad_norm": 1.6555811166763306, "learning_rate": 4.8036877394636016e-05, "loss": 13.661, "step": 4100 }, { "epoch": 3.218390804597701, "grad_norm": 1.1204612255096436, "learning_rate": 4.798898467432951e-05, "loss": 13.9753, "step": 4200 }, { "epoch": 3.2950191570881224, "grad_norm": 2.3801109790802, "learning_rate": 4.7941091954022996e-05, "loss": 13.9332, "step": 4300 }, { "epoch": 3.371647509578544, "grad_norm": 1.314393162727356, "learning_rate": 4.7893199233716476e-05, "loss": 13.8442, "step": 4400 }, { "epoch": 3.4482758620689653, "grad_norm": 2.0551559925079346, "learning_rate": 4.784530651340996e-05, "loss": 13.5678, "step": 4500 }, { "epoch": 3.524904214559387, "grad_norm": 1.4303470849990845, "learning_rate": 4.779741379310345e-05, "loss": 13.7754, "step": 4600 }, { "epoch": 3.6015325670498086, "grad_norm": 2.2181780338287354, "learning_rate": 4.774952107279694e-05, "loss": 13.5568, "step": 4700 }, { "epoch": 3.67816091954023, "grad_norm": 1.377549648284912, "learning_rate": 4.7701628352490424e-05, "loss": 13.4359, "step": 4800 }, { "epoch": 3.7547892720306515, "grad_norm": 1.6644877195358276, "learning_rate": 4.765373563218391e-05, "loss": 13.6701, "step": 4900 }, { "epoch": 3.8314176245210727, "grad_norm": 1.6416462659835815, "learning_rate": 4.76058429118774e-05, "loss": 13.6427, "step": 5000 }, { "epoch": 3.9080459770114944, "grad_norm": 1.5726954936981201, "learning_rate": 4.7557950191570885e-05, "loss": 13.6802, "step": 5100 }, { "epoch": 3.9846743295019156, "grad_norm": 1.3120722770690918, "learning_rate": 4.751005747126437e-05, "loss": 13.6631, "step": 5200 }, { "epoch": 4.0, "eval_loss": 14.28848934173584, "eval_runtime": 44.0456, "eval_samples_per_second": 29.628, "eval_steps_per_second": 3.723, "step": 5220 }, { "epoch": 4.061302681992337, "grad_norm": 1.9124590158462524, "learning_rate": 4.746216475095785e-05, "loss": 13.5388, "step": 5300 }, { "epoch": 4.137931034482759, "grad_norm": 1.3689558506011963, "learning_rate": 4.741427203065134e-05, "loss": 13.5553, "step": 5400 }, { "epoch": 4.21455938697318, "grad_norm": 1.6370700597763062, "learning_rate": 4.7366379310344825e-05, "loss": 13.5781, "step": 5500 }, { "epoch": 4.291187739463601, "grad_norm": 1.993304967880249, "learning_rate": 4.731848659003832e-05, "loss": 13.5261, "step": 5600 }, { "epoch": 4.3678160919540225, "grad_norm": 2.3975770473480225, "learning_rate": 4.7270593869731806e-05, "loss": 13.4305, "step": 5700 }, { "epoch": 4.444444444444445, "grad_norm": 1.9231036901474, "learning_rate": 4.722270114942529e-05, "loss": 13.3994, "step": 5800 }, { "epoch": 4.521072796934866, "grad_norm": 1.0928981304168701, "learning_rate": 4.717480842911878e-05, "loss": 13.3212, "step": 5900 }, { "epoch": 4.597701149425287, "grad_norm": 1.3092130422592163, "learning_rate": 4.7126915708812266e-05, "loss": 13.4476, "step": 6000 }, { "epoch": 4.674329501915709, "grad_norm": 2.0151021480560303, "learning_rate": 4.7079022988505747e-05, "loss": 13.1863, "step": 6100 }, { "epoch": 4.75095785440613, "grad_norm": 1.2778387069702148, "learning_rate": 4.7031130268199233e-05, "loss": 13.3661, "step": 6200 }, { "epoch": 4.827586206896552, "grad_norm": 1.1671264171600342, "learning_rate": 4.698371647509579e-05, "loss": 13.3803, "step": 6300 }, { "epoch": 4.904214559386973, "grad_norm": 0.9788312911987305, "learning_rate": 4.693582375478928e-05, "loss": 13.495, "step": 6400 }, { "epoch": 4.980842911877395, "grad_norm": 3.2978639602661133, "learning_rate": 4.6887931034482766e-05, "loss": 13.4834, "step": 6500 }, { "epoch": 5.0, "eval_loss": 14.041104316711426, "eval_runtime": 43.9982, "eval_samples_per_second": 29.66, "eval_steps_per_second": 3.727, "step": 6525 }, { "epoch": 5.057471264367816, "grad_norm": 1.6198067665100098, "learning_rate": 4.6840038314176246e-05, "loss": 13.1646, "step": 6600 }, { "epoch": 5.134099616858237, "grad_norm": 5.732328414916992, "learning_rate": 4.679214559386973e-05, "loss": 13.4168, "step": 6700 }, { "epoch": 5.210727969348659, "grad_norm": 1.518420934677124, "learning_rate": 4.674425287356322e-05, "loss": 13.2907, "step": 6800 }, { "epoch": 5.287356321839081, "grad_norm": 1.6062932014465332, "learning_rate": 4.6696360153256706e-05, "loss": 13.406, "step": 6900 }, { "epoch": 5.363984674329502, "grad_norm": 2.5659947395324707, "learning_rate": 4.664846743295019e-05, "loss": 13.252, "step": 7000 }, { "epoch": 5.440613026819923, "grad_norm": 1.4965115785598755, "learning_rate": 4.660057471264368e-05, "loss": 13.2683, "step": 7100 }, { "epoch": 5.517241379310345, "grad_norm": 2.3210604190826416, "learning_rate": 4.655268199233717e-05, "loss": 13.1846, "step": 7200 }, { "epoch": 5.593869731800766, "grad_norm": 1.508138656616211, "learning_rate": 4.6504789272030654e-05, "loss": 13.1303, "step": 7300 }, { "epoch": 5.670498084291188, "grad_norm": 1.2769402265548706, "learning_rate": 4.645689655172414e-05, "loss": 13.1109, "step": 7400 }, { "epoch": 5.747126436781609, "grad_norm": 3.0062999725341797, "learning_rate": 4.640900383141763e-05, "loss": 13.1859, "step": 7500 }, { "epoch": 5.823754789272031, "grad_norm": 1.4893639087677002, "learning_rate": 4.636111111111111e-05, "loss": 13.2236, "step": 7600 }, { "epoch": 5.900383141762452, "grad_norm": 1.9955596923828125, "learning_rate": 4.63132183908046e-05, "loss": 13.2806, "step": 7700 }, { "epoch": 5.977011494252873, "grad_norm": 1.733920931816101, "learning_rate": 4.626532567049809e-05, "loss": 12.9426, "step": 7800 }, { "epoch": 6.0, "eval_loss": 13.950128555297852, "eval_runtime": 44.0078, "eval_samples_per_second": 29.654, "eval_steps_per_second": 3.727, "step": 7830 }, { "epoch": 6.053639846743295, "grad_norm": 1.3697247505187988, "learning_rate": 4.6217432950191575e-05, "loss": 13.001, "step": 7900 }, { "epoch": 6.130268199233717, "grad_norm": 1.7222646474838257, "learning_rate": 4.616954022988506e-05, "loss": 13.1098, "step": 8000 }, { "epoch": 6.206896551724138, "grad_norm": 1.5488767623901367, "learning_rate": 4.612164750957855e-05, "loss": 13.2406, "step": 8100 }, { "epoch": 6.283524904214559, "grad_norm": 1.1356619596481323, "learning_rate": 4.6073754789272036e-05, "loss": 13.0969, "step": 8200 }, { "epoch": 6.360153256704981, "grad_norm": 2.161534547805786, "learning_rate": 4.602586206896552e-05, "loss": 12.8021, "step": 8300 }, { "epoch": 6.436781609195402, "grad_norm": 1.42888605594635, "learning_rate": 4.5977969348659e-05, "loss": 13.007, "step": 8400 }, { "epoch": 6.513409961685824, "grad_norm": 1.5181623697280884, "learning_rate": 4.593007662835249e-05, "loss": 13.2494, "step": 8500 }, { "epoch": 6.590038314176245, "grad_norm": 2.6794161796569824, "learning_rate": 4.588218390804598e-05, "loss": 13.0472, "step": 8600 }, { "epoch": 6.666666666666667, "grad_norm": 1.3213189840316772, "learning_rate": 4.5834291187739464e-05, "loss": 12.7648, "step": 8700 }, { "epoch": 6.743295019157088, "grad_norm": 1.1679490804672241, "learning_rate": 4.578639846743295e-05, "loss": 13.0907, "step": 8800 }, { "epoch": 6.819923371647509, "grad_norm": 1.7697467803955078, "learning_rate": 4.573850574712644e-05, "loss": 12.8777, "step": 8900 }, { "epoch": 6.896551724137931, "grad_norm": 1.7574371099472046, "learning_rate": 4.5690613026819924e-05, "loss": 12.8949, "step": 9000 }, { "epoch": 6.973180076628353, "grad_norm": 1.8508405685424805, "learning_rate": 4.564272030651342e-05, "loss": 13.0364, "step": 9100 }, { "epoch": 7.0, "eval_loss": 13.742591857910156, "eval_runtime": 44.1082, "eval_samples_per_second": 29.586, "eval_steps_per_second": 3.718, "step": 9135 }, { "epoch": 7.049808429118774, "grad_norm": 1.304430365562439, "learning_rate": 4.55948275862069e-05, "loss": 13.1197, "step": 9200 }, { "epoch": 7.126436781609195, "grad_norm": 1.112478256225586, "learning_rate": 4.5546934865900385e-05, "loss": 13.072, "step": 9300 }, { "epoch": 7.203065134099617, "grad_norm": 1.6277681589126587, "learning_rate": 4.5499521072796937e-05, "loss": 12.8787, "step": 9400 }, { "epoch": 7.2796934865900385, "grad_norm": 1.6854459047317505, "learning_rate": 4.5451628352490423e-05, "loss": 12.9961, "step": 9500 }, { "epoch": 7.35632183908046, "grad_norm": 1.5988355875015259, "learning_rate": 4.540373563218391e-05, "loss": 12.9588, "step": 9600 }, { "epoch": 7.432950191570881, "grad_norm": 1.0676491260528564, "learning_rate": 4.53558429118774e-05, "loss": 12.8359, "step": 9700 }, { "epoch": 7.509578544061303, "grad_norm": 1.8556437492370605, "learning_rate": 4.5307950191570884e-05, "loss": 12.813, "step": 9800 }, { "epoch": 7.586206896551724, "grad_norm": 1.5877550840377808, "learning_rate": 4.526005747126437e-05, "loss": 12.9205, "step": 9900 }, { "epoch": 7.662835249042145, "grad_norm": 1.2095483541488647, "learning_rate": 4.521216475095786e-05, "loss": 12.9472, "step": 10000 }, { "epoch": 7.739463601532567, "grad_norm": 3.998228073120117, "learning_rate": 4.5164272030651345e-05, "loss": 12.871, "step": 10100 }, { "epoch": 7.816091954022989, "grad_norm": 1.4408106803894043, "learning_rate": 4.511637931034483e-05, "loss": 12.9723, "step": 10200 }, { "epoch": 7.89272030651341, "grad_norm": 0.9685239791870117, "learning_rate": 4.506848659003832e-05, "loss": 12.7816, "step": 10300 }, { "epoch": 7.969348659003831, "grad_norm": 2.4164698123931885, "learning_rate": 4.5020593869731805e-05, "loss": 12.8656, "step": 10400 }, { "epoch": 8.0, "eval_loss": 13.643902778625488, "eval_runtime": 44.1312, "eval_samples_per_second": 29.571, "eval_steps_per_second": 3.716, "step": 10440 }, { "epoch": 8.045977011494253, "grad_norm": 1.4973284006118774, "learning_rate": 4.497270114942529e-05, "loss": 12.9654, "step": 10500 }, { "epoch": 8.122605363984674, "grad_norm": 1.9837547540664673, "learning_rate": 4.492480842911877e-05, "loss": 12.9358, "step": 10600 }, { "epoch": 8.199233716475096, "grad_norm": 2.1501142978668213, "learning_rate": 4.487691570881226e-05, "loss": 12.9226, "step": 10700 }, { "epoch": 8.275862068965518, "grad_norm": 1.959155797958374, "learning_rate": 4.4829022988505746e-05, "loss": 12.8136, "step": 10800 }, { "epoch": 8.352490421455938, "grad_norm": 1.7081148624420166, "learning_rate": 4.478113026819923e-05, "loss": 12.6215, "step": 10900 }, { "epoch": 8.42911877394636, "grad_norm": 3.0818092823028564, "learning_rate": 4.473323754789272e-05, "loss": 12.7263, "step": 11000 }, { "epoch": 8.505747126436782, "grad_norm": 1.2609460353851318, "learning_rate": 4.468534482758621e-05, "loss": 12.615, "step": 11100 }, { "epoch": 8.582375478927203, "grad_norm": 1.1553901433944702, "learning_rate": 4.46374521072797e-05, "loss": 12.9115, "step": 11200 }, { "epoch": 8.659003831417625, "grad_norm": 2.876321792602539, "learning_rate": 4.458955938697319e-05, "loss": 12.8372, "step": 11300 }, { "epoch": 8.735632183908045, "grad_norm": 2.3537096977233887, "learning_rate": 4.454166666666667e-05, "loss": 12.8684, "step": 11400 }, { "epoch": 8.812260536398467, "grad_norm": 1.4264323711395264, "learning_rate": 4.4493773946360154e-05, "loss": 12.6151, "step": 11500 }, { "epoch": 8.88888888888889, "grad_norm": 1.8997728824615479, "learning_rate": 4.4446360153256706e-05, "loss": 12.8187, "step": 11600 }, { "epoch": 8.96551724137931, "grad_norm": 1.8338580131530762, "learning_rate": 4.439846743295019e-05, "loss": 12.7365, "step": 11700 }, { "epoch": 9.0, "eval_loss": 13.53819465637207, "eval_runtime": 44.0314, "eval_samples_per_second": 29.638, "eval_steps_per_second": 3.725, "step": 11745 }, { "epoch": 9.042145593869732, "grad_norm": 12.737005233764648, "learning_rate": 4.4351053639846745e-05, "loss": 12.8002, "step": 11800 }, { "epoch": 9.118773946360154, "grad_norm": 1.8820631504058838, "learning_rate": 4.430316091954023e-05, "loss": 12.8415, "step": 11900 }, { "epoch": 9.195402298850574, "grad_norm": 1.5012093782424927, "learning_rate": 4.425526819923372e-05, "loss": 12.8011, "step": 12000 }, { "epoch": 9.272030651340996, "grad_norm": 2.5062639713287354, "learning_rate": 4.4207375478927205e-05, "loss": 12.7156, "step": 12100 }, { "epoch": 9.348659003831418, "grad_norm": 1.5295358896255493, "learning_rate": 4.415948275862069e-05, "loss": 12.8449, "step": 12200 }, { "epoch": 9.425287356321839, "grad_norm": 1.6232823133468628, "learning_rate": 4.411159003831418e-05, "loss": 12.7345, "step": 12300 }, { "epoch": 9.50191570881226, "grad_norm": 1.4783318042755127, "learning_rate": 4.4063697318007666e-05, "loss": 12.7392, "step": 12400 }, { "epoch": 9.578544061302683, "grad_norm": 1.7494572401046753, "learning_rate": 4.4015804597701146e-05, "loss": 12.6017, "step": 12500 }, { "epoch": 9.655172413793103, "grad_norm": 2.065991163253784, "learning_rate": 4.396791187739464e-05, "loss": 12.695, "step": 12600 }, { "epoch": 9.731800766283525, "grad_norm": 1.2360838651657104, "learning_rate": 4.3920019157088127e-05, "loss": 12.7994, "step": 12700 }, { "epoch": 9.808429118773946, "grad_norm": 2.084902048110962, "learning_rate": 4.3872126436781613e-05, "loss": 12.6864, "step": 12800 }, { "epoch": 9.885057471264368, "grad_norm": 1.4381409883499146, "learning_rate": 4.38242337164751e-05, "loss": 12.6875, "step": 12900 }, { "epoch": 9.96168582375479, "grad_norm": 1.5936471223831177, "learning_rate": 4.377634099616859e-05, "loss": 12.6413, "step": 13000 }, { "epoch": 10.0, "eval_loss": 13.456477165222168, "eval_runtime": 44.0741, "eval_samples_per_second": 29.609, "eval_steps_per_second": 3.721, "step": 13050 }, { "epoch": 10.03831417624521, "grad_norm": 1.1829323768615723, "learning_rate": 4.3728448275862074e-05, "loss": 12.7182, "step": 13100 }, { "epoch": 10.114942528735632, "grad_norm": 1.7679022550582886, "learning_rate": 4.368055555555556e-05, "loss": 12.7508, "step": 13200 }, { "epoch": 10.191570881226054, "grad_norm": 2.4053192138671875, "learning_rate": 4.363266283524904e-05, "loss": 12.5668, "step": 13300 }, { "epoch": 10.268199233716475, "grad_norm": 2.4858756065368652, "learning_rate": 4.358477011494253e-05, "loss": 12.6561, "step": 13400 }, { "epoch": 10.344827586206897, "grad_norm": 2.138453483581543, "learning_rate": 4.3536877394636015e-05, "loss": 12.6829, "step": 13500 }, { "epoch": 10.421455938697317, "grad_norm": 1.490075707435608, "learning_rate": 4.34889846743295e-05, "loss": 12.7284, "step": 13600 }, { "epoch": 10.49808429118774, "grad_norm": 3.1338703632354736, "learning_rate": 4.344109195402299e-05, "loss": 12.5722, "step": 13700 }, { "epoch": 10.574712643678161, "grad_norm": 1.844388723373413, "learning_rate": 4.3393199233716475e-05, "loss": 12.8212, "step": 13800 }, { "epoch": 10.651340996168582, "grad_norm": 1.9379137754440308, "learning_rate": 4.334530651340996e-05, "loss": 12.368, "step": 13900 }, { "epoch": 10.727969348659004, "grad_norm": 4.608842849731445, "learning_rate": 4.3297413793103456e-05, "loss": 12.3258, "step": 14000 }, { "epoch": 10.804597701149426, "grad_norm": 1.607155680656433, "learning_rate": 4.325e-05, "loss": 12.8355, "step": 14100 }, { "epoch": 10.881226053639846, "grad_norm": 1.7595943212509155, "learning_rate": 4.320210727969349e-05, "loss": 12.6135, "step": 14200 }, { "epoch": 10.957854406130268, "grad_norm": 1.7879704236984253, "learning_rate": 4.3154214559386975e-05, "loss": 12.7107, "step": 14300 }, { "epoch": 11.0, "eval_loss": 13.364398002624512, "eval_runtime": 44.0273, "eval_samples_per_second": 29.641, "eval_steps_per_second": 3.725, "step": 14355 }, { "epoch": 11.03448275862069, "grad_norm": 3.187349557876587, "learning_rate": 4.310632183908046e-05, "loss": 12.7471, "step": 14400 }, { "epoch": 11.11111111111111, "grad_norm": 3.118311643600464, "learning_rate": 4.305842911877395e-05, "loss": 12.4422, "step": 14500 }, { "epoch": 11.187739463601533, "grad_norm": 2.276580333709717, "learning_rate": 4.3010536398467435e-05, "loss": 12.5443, "step": 14600 }, { "epoch": 11.264367816091955, "grad_norm": 1.3369340896606445, "learning_rate": 4.296264367816092e-05, "loss": 12.7497, "step": 14700 }, { "epoch": 11.340996168582375, "grad_norm": 1.2438215017318726, "learning_rate": 4.291475095785441e-05, "loss": 12.6343, "step": 14800 }, { "epoch": 11.417624521072797, "grad_norm": 1.668867826461792, "learning_rate": 4.2866858237547896e-05, "loss": 12.673, "step": 14900 }, { "epoch": 11.494252873563218, "grad_norm": 2.550316572189331, "learning_rate": 4.281896551724138e-05, "loss": 12.7346, "step": 15000 }, { "epoch": 11.57088122605364, "grad_norm": 1.3926326036453247, "learning_rate": 4.277107279693487e-05, "loss": 12.5431, "step": 15100 }, { "epoch": 11.647509578544062, "grad_norm": 1.3561134338378906, "learning_rate": 4.272318007662836e-05, "loss": 12.4943, "step": 15200 }, { "epoch": 11.724137931034482, "grad_norm": 1.4978444576263428, "learning_rate": 4.2675287356321844e-05, "loss": 12.4103, "step": 15300 }, { "epoch": 11.800766283524904, "grad_norm": 1.8163210153579712, "learning_rate": 4.262739463601533e-05, "loss": 12.5454, "step": 15400 }, { "epoch": 11.877394636015326, "grad_norm": 1.3819987773895264, "learning_rate": 4.257950191570881e-05, "loss": 12.5219, "step": 15500 }, { "epoch": 11.954022988505747, "grad_norm": 1.6237196922302246, "learning_rate": 4.25316091954023e-05, "loss": 12.5876, "step": 15600 }, { "epoch": 12.0, "eval_loss": 13.39963436126709, "eval_runtime": 44.002, "eval_samples_per_second": 29.658, "eval_steps_per_second": 3.727, "step": 15660 }, { "epoch": 12.030651340996169, "grad_norm": 1.1271090507507324, "learning_rate": 4.2483716475095784e-05, "loss": 12.3581, "step": 15700 }, { "epoch": 12.10727969348659, "grad_norm": 1.5027310848236084, "learning_rate": 4.243582375478927e-05, "loss": 12.5517, "step": 15800 }, { "epoch": 12.183908045977011, "grad_norm": 1.5543391704559326, "learning_rate": 4.238793103448276e-05, "loss": 12.7011, "step": 15900 }, { "epoch": 12.260536398467433, "grad_norm": 1.7037404775619507, "learning_rate": 4.2340038314176245e-05, "loss": 12.289, "step": 16000 }, { "epoch": 12.337164750957854, "grad_norm": 4.505245208740234, "learning_rate": 4.229214559386974e-05, "loss": 12.3584, "step": 16100 }, { "epoch": 12.413793103448276, "grad_norm": 1.5144113302230835, "learning_rate": 4.2244252873563225e-05, "loss": 12.4209, "step": 16200 }, { "epoch": 12.490421455938698, "grad_norm": 1.2396819591522217, "learning_rate": 4.2196360153256706e-05, "loss": 12.4463, "step": 16300 }, { "epoch": 12.567049808429118, "grad_norm": 5.947683334350586, "learning_rate": 4.214846743295019e-05, "loss": 12.6401, "step": 16400 }, { "epoch": 12.64367816091954, "grad_norm": 2.070812225341797, "learning_rate": 4.210057471264368e-05, "loss": 12.6885, "step": 16500 }, { "epoch": 12.720306513409962, "grad_norm": 1.7540252208709717, "learning_rate": 4.2052681992337166e-05, "loss": 12.3138, "step": 16600 }, { "epoch": 12.796934865900383, "grad_norm": 1.3372827768325806, "learning_rate": 4.200478927203065e-05, "loss": 12.8475, "step": 16700 }, { "epoch": 12.873563218390805, "grad_norm": 1.6598443984985352, "learning_rate": 4.195689655172414e-05, "loss": 12.575, "step": 16800 }, { "epoch": 12.950191570881227, "grad_norm": 1.5420461893081665, "learning_rate": 4.190900383141763e-05, "loss": 12.499, "step": 16900 }, { "epoch": 13.0, "eval_loss": 13.359596252441406, "eval_runtime": 43.9919, "eval_samples_per_second": 29.665, "eval_steps_per_second": 3.728, "step": 16965 }, { "epoch": 13.026819923371647, "grad_norm": 1.785803198814392, "learning_rate": 4.1861111111111114e-05, "loss": 12.3123, "step": 17000 }, { "epoch": 13.10344827586207, "grad_norm": 3.8619072437286377, "learning_rate": 4.1813697318007665e-05, "loss": 12.4633, "step": 17100 }, { "epoch": 13.18007662835249, "grad_norm": 1.2189018726348877, "learning_rate": 4.176580459770115e-05, "loss": 12.4732, "step": 17200 }, { "epoch": 13.256704980842912, "grad_norm": 3.579725742340088, "learning_rate": 4.171791187739464e-05, "loss": 12.3486, "step": 17300 }, { "epoch": 13.333333333333334, "grad_norm": 1.258268117904663, "learning_rate": 4.1670019157088126e-05, "loss": 12.5506, "step": 17400 }, { "epoch": 13.409961685823754, "grad_norm": 1.6867891550064087, "learning_rate": 4.162212643678161e-05, "loss": 12.5667, "step": 17500 }, { "epoch": 13.486590038314176, "grad_norm": 1.5345897674560547, "learning_rate": 4.15742337164751e-05, "loss": 12.5206, "step": 17600 }, { "epoch": 13.563218390804598, "grad_norm": 1.1699010133743286, "learning_rate": 4.152634099616859e-05, "loss": 12.3728, "step": 17700 }, { "epoch": 13.639846743295019, "grad_norm": 1.669938325881958, "learning_rate": 4.147844827586207e-05, "loss": 12.4601, "step": 17800 }, { "epoch": 13.71647509578544, "grad_norm": 1.2530852556228638, "learning_rate": 4.1430555555555554e-05, "loss": 12.4501, "step": 17900 }, { "epoch": 13.793103448275861, "grad_norm": 1.790138840675354, "learning_rate": 4.138266283524904e-05, "loss": 12.467, "step": 18000 }, { "epoch": 13.869731800766283, "grad_norm": 1.3373574018478394, "learning_rate": 4.133477011494253e-05, "loss": 12.4602, "step": 18100 }, { "epoch": 13.946360153256705, "grad_norm": 1.837951898574829, "learning_rate": 4.128687739463602e-05, "loss": 12.4591, "step": 18200 }, { "epoch": 14.0, "eval_loss": 13.289255142211914, "eval_runtime": 43.9866, "eval_samples_per_second": 29.668, "eval_steps_per_second": 3.728, "step": 18270 }, { "epoch": 14.022988505747126, "grad_norm": 1.540867805480957, "learning_rate": 4.123898467432951e-05, "loss": 12.59, "step": 18300 }, { "epoch": 14.099616858237548, "grad_norm": 1.6285018920898438, "learning_rate": 4.1191091954022995e-05, "loss": 12.5162, "step": 18400 }, { "epoch": 14.17624521072797, "grad_norm": 0.8983919620513916, "learning_rate": 4.114319923371648e-05, "loss": 12.4312, "step": 18500 }, { "epoch": 14.25287356321839, "grad_norm": 1.7475948333740234, "learning_rate": 4.109530651340996e-05, "loss": 12.483, "step": 18600 }, { "epoch": 14.329501915708812, "grad_norm": 1.723708987236023, "learning_rate": 4.104741379310345e-05, "loss": 12.5177, "step": 18700 }, { "epoch": 14.406130268199234, "grad_norm": 1.3113809823989868, "learning_rate": 4.0999521072796936e-05, "loss": 12.3171, "step": 18800 }, { "epoch": 14.482758620689655, "grad_norm": 1.7641185522079468, "learning_rate": 4.095162835249042e-05, "loss": 12.4669, "step": 18900 }, { "epoch": 14.559386973180077, "grad_norm": 1.6181635856628418, "learning_rate": 4.090373563218391e-05, "loss": 12.3302, "step": 19000 }, { "epoch": 14.636015325670499, "grad_norm": 1.2323795557022095, "learning_rate": 4.0855842911877396e-05, "loss": 12.4211, "step": 19100 }, { "epoch": 14.71264367816092, "grad_norm": 1.7597166299819946, "learning_rate": 4.080795019157088e-05, "loss": 12.4985, "step": 19200 }, { "epoch": 14.789272030651341, "grad_norm": 1.0281277894973755, "learning_rate": 4.076005747126437e-05, "loss": 12.5672, "step": 19300 }, { "epoch": 14.865900383141762, "grad_norm": 3.3272478580474854, "learning_rate": 4.071216475095786e-05, "loss": 12.2671, "step": 19400 }, { "epoch": 14.942528735632184, "grad_norm": 3.1264896392822266, "learning_rate": 4.066427203065134e-05, "loss": 12.4736, "step": 19500 }, { "epoch": 15.0, "eval_loss": 13.205364227294922, "eval_runtime": 43.9612, "eval_samples_per_second": 29.685, "eval_steps_per_second": 3.731, "step": 19575 }, { "epoch": 15.019157088122606, "grad_norm": 1.568294882774353, "learning_rate": 4.061637931034483e-05, "loss": 12.4604, "step": 19600 }, { "epoch": 15.095785440613026, "grad_norm": 1.919912576675415, "learning_rate": 4.056848659003832e-05, "loss": 12.3773, "step": 19700 }, { "epoch": 15.172413793103448, "grad_norm": 1.5357537269592285, "learning_rate": 4.0520593869731804e-05, "loss": 12.3406, "step": 19800 }, { "epoch": 15.24904214559387, "grad_norm": 1.7306512594223022, "learning_rate": 4.0473180076628356e-05, "loss": 12.4036, "step": 19900 }, { "epoch": 15.32567049808429, "grad_norm": 1.6036773920059204, "learning_rate": 4.0425287356321836e-05, "loss": 12.3554, "step": 20000 }, { "epoch": 15.402298850574713, "grad_norm": 1.211962342262268, "learning_rate": 4.037739463601532e-05, "loss": 12.5084, "step": 20100 }, { "epoch": 15.478927203065133, "grad_norm": 1.4626506567001343, "learning_rate": 4.032950191570881e-05, "loss": 12.3593, "step": 20200 }, { "epoch": 15.555555555555555, "grad_norm": 1.6557157039642334, "learning_rate": 4.0281609195402304e-05, "loss": 12.3249, "step": 20300 }, { "epoch": 15.632183908045977, "grad_norm": 1.735300064086914, "learning_rate": 4.023371647509579e-05, "loss": 12.2958, "step": 20400 }, { "epoch": 15.708812260536398, "grad_norm": 1.2972387075424194, "learning_rate": 4.018582375478928e-05, "loss": 12.4011, "step": 20500 }, { "epoch": 15.78544061302682, "grad_norm": 1.2028956413269043, "learning_rate": 4.0137931034482764e-05, "loss": 12.3923, "step": 20600 }, { "epoch": 15.862068965517242, "grad_norm": 1.9574451446533203, "learning_rate": 4.009003831417625e-05, "loss": 12.4927, "step": 20700 }, { "epoch": 15.938697318007662, "grad_norm": 2.3753159046173096, "learning_rate": 4.004214559386973e-05, "loss": 12.4565, "step": 20800 }, { "epoch": 16.0, "eval_loss": 13.146517753601074, "eval_runtime": 43.956, "eval_samples_per_second": 29.689, "eval_steps_per_second": 3.731, "step": 20880 }, { "epoch": 16.015325670498083, "grad_norm": 1.4980436563491821, "learning_rate": 3.999425287356322e-05, "loss": 12.4546, "step": 20900 }, { "epoch": 16.091954022988507, "grad_norm": 1.2177377939224243, "learning_rate": 3.9946360153256705e-05, "loss": 12.3682, "step": 21000 }, { "epoch": 16.168582375478927, "grad_norm": 1.9785245656967163, "learning_rate": 3.989846743295019e-05, "loss": 12.4315, "step": 21100 }, { "epoch": 16.245210727969347, "grad_norm": 2.2773125171661377, "learning_rate": 3.985057471264368e-05, "loss": 12.4728, "step": 21200 }, { "epoch": 16.32183908045977, "grad_norm": 1.1049697399139404, "learning_rate": 3.9802681992337166e-05, "loss": 12.0735, "step": 21300 }, { "epoch": 16.39846743295019, "grad_norm": 2.937175750732422, "learning_rate": 3.975478927203065e-05, "loss": 12.4713, "step": 21400 }, { "epoch": 16.47509578544061, "grad_norm": 1.058626651763916, "learning_rate": 3.970689655172414e-05, "loss": 12.3329, "step": 21500 }, { "epoch": 16.551724137931036, "grad_norm": 2.357311248779297, "learning_rate": 3.9659003831417626e-05, "loss": 12.2249, "step": 21600 }, { "epoch": 16.628352490421456, "grad_norm": 1.0534141063690186, "learning_rate": 3.961111111111111e-05, "loss": 12.4414, "step": 21700 }, { "epoch": 16.704980842911876, "grad_norm": 1.5288047790527344, "learning_rate": 3.95632183908046e-05, "loss": 12.0682, "step": 21800 }, { "epoch": 16.7816091954023, "grad_norm": 2.628070831298828, "learning_rate": 3.951532567049809e-05, "loss": 12.367, "step": 21900 }, { "epoch": 16.85823754789272, "grad_norm": 1.4049383401870728, "learning_rate": 3.9467432950191574e-05, "loss": 12.1073, "step": 22000 }, { "epoch": 16.93486590038314, "grad_norm": 1.8470909595489502, "learning_rate": 3.941954022988506e-05, "loss": 12.3757, "step": 22100 }, { "epoch": 17.0, "eval_loss": 13.134416580200195, "eval_runtime": 44.0763, "eval_samples_per_second": 29.608, "eval_steps_per_second": 3.721, "step": 22185 }, { "epoch": 17.011494252873565, "grad_norm": 1.1388458013534546, "learning_rate": 3.937164750957855e-05, "loss": 12.6443, "step": 22200 }, { "epoch": 17.088122605363985, "grad_norm": 1.202028512954712, "learning_rate": 3.9323754789272034e-05, "loss": 12.3013, "step": 22300 }, { "epoch": 17.164750957854405, "grad_norm": 1.210375189781189, "learning_rate": 3.927586206896552e-05, "loss": 12.4812, "step": 22400 }, { "epoch": 17.24137931034483, "grad_norm": 1.6550730466842651, "learning_rate": 3.922796934865901e-05, "loss": 12.3152, "step": 22500 }, { "epoch": 17.31800766283525, "grad_norm": 1.5777093172073364, "learning_rate": 3.918007662835249e-05, "loss": 12.2296, "step": 22600 }, { "epoch": 17.39463601532567, "grad_norm": 7.877992153167725, "learning_rate": 3.9132183908045975e-05, "loss": 12.4408, "step": 22700 }, { "epoch": 17.47126436781609, "grad_norm": 1.6760473251342773, "learning_rate": 3.908429118773946e-05, "loss": 12.251, "step": 22800 }, { "epoch": 17.547892720306514, "grad_norm": 2.4793410301208496, "learning_rate": 3.903639846743295e-05, "loss": 12.3864, "step": 22900 }, { "epoch": 17.624521072796934, "grad_norm": 1.331120491027832, "learning_rate": 3.8988505747126436e-05, "loss": 12.0078, "step": 23000 }, { "epoch": 17.701149425287355, "grad_norm": 1.1477069854736328, "learning_rate": 3.894109195402299e-05, "loss": 12.2234, "step": 23100 }, { "epoch": 17.77777777777778, "grad_norm": 1.5665520429611206, "learning_rate": 3.8893199233716474e-05, "loss": 12.2716, "step": 23200 }, { "epoch": 17.8544061302682, "grad_norm": 1.4720168113708496, "learning_rate": 3.884530651340996e-05, "loss": 12.2528, "step": 23300 }, { "epoch": 17.93103448275862, "grad_norm": 1.4990317821502686, "learning_rate": 3.879741379310345e-05, "loss": 12.4111, "step": 23400 }, { "epoch": 18.0, "eval_loss": 13.10958194732666, "eval_runtime": 43.9884, "eval_samples_per_second": 29.667, "eval_steps_per_second": 3.728, "step": 23490 }, { "epoch": 18.007662835249043, "grad_norm": 1.653239130973816, "learning_rate": 3.8749521072796935e-05, "loss": 12.4558, "step": 23500 }, { "epoch": 18.084291187739463, "grad_norm": 1.3574182987213135, "learning_rate": 3.870162835249042e-05, "loss": 12.3242, "step": 23600 }, { "epoch": 18.160919540229884, "grad_norm": 2.0138070583343506, "learning_rate": 3.865373563218391e-05, "loss": 12.2255, "step": 23700 }, { "epoch": 18.237547892720308, "grad_norm": 1.6546958684921265, "learning_rate": 3.8605842911877396e-05, "loss": 12.3826, "step": 23800 }, { "epoch": 18.314176245210728, "grad_norm": 1.304247498512268, "learning_rate": 3.855795019157088e-05, "loss": 12.1766, "step": 23900 }, { "epoch": 18.39080459770115, "grad_norm": 1.109941005706787, "learning_rate": 3.851005747126437e-05, "loss": 12.3784, "step": 24000 }, { "epoch": 18.467432950191572, "grad_norm": 4.5435872077941895, "learning_rate": 3.8462164750957856e-05, "loss": 12.2292, "step": 24100 }, { "epoch": 18.544061302681992, "grad_norm": 2.141022205352783, "learning_rate": 3.841427203065134e-05, "loss": 12.2826, "step": 24200 }, { "epoch": 18.620689655172413, "grad_norm": 1.6946494579315186, "learning_rate": 3.836637931034483e-05, "loss": 12.3012, "step": 24300 }, { "epoch": 18.697318007662837, "grad_norm": 1.3159388303756714, "learning_rate": 3.831848659003832e-05, "loss": 12.1835, "step": 24400 }, { "epoch": 18.773946360153257, "grad_norm": 2.499986410140991, "learning_rate": 3.8270593869731804e-05, "loss": 12.4302, "step": 24500 }, { "epoch": 18.850574712643677, "grad_norm": 1.7443987131118774, "learning_rate": 3.822270114942529e-05, "loss": 12.5402, "step": 24600 }, { "epoch": 18.9272030651341, "grad_norm": 1.4758720397949219, "learning_rate": 3.817480842911878e-05, "loss": 12.3978, "step": 24700 }, { "epoch": 19.0, "eval_loss": 13.101744651794434, "eval_runtime": 43.9919, "eval_samples_per_second": 29.665, "eval_steps_per_second": 3.728, "step": 24795 }, { "epoch": 19.00383141762452, "grad_norm": 1.774843454360962, "learning_rate": 3.812691570881226e-05, "loss": 12.2954, "step": 24800 }, { "epoch": 19.080459770114942, "grad_norm": 1.693176031112671, "learning_rate": 3.8079022988505745e-05, "loss": 12.3156, "step": 24900 }, { "epoch": 19.157088122605366, "grad_norm": 1.3531700372695923, "learning_rate": 3.803113026819923e-05, "loss": 12.3989, "step": 25000 }, { "epoch": 19.233716475095786, "grad_norm": 2.083587884902954, "learning_rate": 3.798323754789272e-05, "loss": 12.3523, "step": 25100 }, { "epoch": 19.310344827586206, "grad_norm": 2.1645917892456055, "learning_rate": 3.793534482758621e-05, "loss": 12.0512, "step": 25200 }, { "epoch": 19.386973180076627, "grad_norm": 1.8869907855987549, "learning_rate": 3.78874521072797e-05, "loss": 12.4837, "step": 25300 }, { "epoch": 19.46360153256705, "grad_norm": 1.2421497106552124, "learning_rate": 3.7840038314176244e-05, "loss": 11.9937, "step": 25400 }, { "epoch": 19.54022988505747, "grad_norm": 1.5155110359191895, "learning_rate": 3.779214559386973e-05, "loss": 12.2264, "step": 25500 }, { "epoch": 19.61685823754789, "grad_norm": 1.1511332988739014, "learning_rate": 3.774425287356322e-05, "loss": 12.2063, "step": 25600 }, { "epoch": 19.693486590038315, "grad_norm": 1.8984183073043823, "learning_rate": 3.7696360153256705e-05, "loss": 12.3237, "step": 25700 }, { "epoch": 19.770114942528735, "grad_norm": 0.9674005508422852, "learning_rate": 3.764846743295019e-05, "loss": 12.1877, "step": 25800 }, { "epoch": 19.846743295019156, "grad_norm": 2.0560641288757324, "learning_rate": 3.7600574712643685e-05, "loss": 12.2343, "step": 25900 }, { "epoch": 19.92337164750958, "grad_norm": 1.3923600912094116, "learning_rate": 3.755268199233717e-05, "loss": 12.2683, "step": 26000 }, { "epoch": 20.0, "grad_norm": 2.9314024448394775, "learning_rate": 3.750478927203065e-05, "loss": 12.3074, "step": 26100 }, { "epoch": 20.0, "eval_loss": 13.07620906829834, "eval_runtime": 43.9934, "eval_samples_per_second": 29.664, "eval_steps_per_second": 3.728, "step": 26100 }, { "epoch": 20.07662835249042, "grad_norm": 1.5305142402648926, "learning_rate": 3.745689655172414e-05, "loss": 12.2615, "step": 26200 }, { "epoch": 20.153256704980844, "grad_norm": 1.3846060037612915, "learning_rate": 3.7409003831417626e-05, "loss": 12.3109, "step": 26300 }, { "epoch": 20.229885057471265, "grad_norm": 3.0465173721313477, "learning_rate": 3.736111111111111e-05, "loss": 12.258, "step": 26400 }, { "epoch": 20.306513409961685, "grad_norm": 3.9723782539367676, "learning_rate": 3.73132183908046e-05, "loss": 12.2494, "step": 26500 }, { "epoch": 20.38314176245211, "grad_norm": 1.464296817779541, "learning_rate": 3.7265325670498086e-05, "loss": 12.2231, "step": 26600 }, { "epoch": 20.45977011494253, "grad_norm": 1.6789374351501465, "learning_rate": 3.721743295019157e-05, "loss": 12.3391, "step": 26700 }, { "epoch": 20.53639846743295, "grad_norm": 1.1731619834899902, "learning_rate": 3.716954022988506e-05, "loss": 12.2172, "step": 26800 }, { "epoch": 20.613026819923373, "grad_norm": 2.8839802742004395, "learning_rate": 3.712164750957855e-05, "loss": 12.251, "step": 26900 }, { "epoch": 20.689655172413794, "grad_norm": 1.3104863166809082, "learning_rate": 3.707375478927203e-05, "loss": 12.4269, "step": 27000 }, { "epoch": 20.766283524904214, "grad_norm": 2.5182230472564697, "learning_rate": 3.7025862068965514e-05, "loss": 12.1972, "step": 27100 }, { "epoch": 20.842911877394634, "grad_norm": 1.4510316848754883, "learning_rate": 3.6977969348659e-05, "loss": 12.1446, "step": 27200 }, { "epoch": 20.919540229885058, "grad_norm": 1.7377287149429321, "learning_rate": 3.6930076628352495e-05, "loss": 12.2374, "step": 27300 }, { "epoch": 20.99616858237548, "grad_norm": 1.308686375617981, "learning_rate": 3.6882662835249046e-05, "loss": 12.2169, "step": 27400 }, { "epoch": 21.0, "eval_loss": 13.027502059936523, "eval_runtime": 44.025, "eval_samples_per_second": 29.642, "eval_steps_per_second": 3.725, "step": 27405 }, { "epoch": 21.0727969348659, "grad_norm": 1.7697923183441162, "learning_rate": 3.6834770114942526e-05, "loss": 12.3711, "step": 27500 }, { "epoch": 21.149425287356323, "grad_norm": 1.2963312864303589, "learning_rate": 3.678687739463601e-05, "loss": 12.1974, "step": 27600 }, { "epoch": 21.226053639846743, "grad_norm": 1.617470383644104, "learning_rate": 3.67389846743295e-05, "loss": 12.1879, "step": 27700 }, { "epoch": 21.302681992337163, "grad_norm": 2.007051944732666, "learning_rate": 3.669109195402299e-05, "loss": 12.2758, "step": 27800 }, { "epoch": 21.379310344827587, "grad_norm": 1.4421669244766235, "learning_rate": 3.6643199233716474e-05, "loss": 12.1852, "step": 27900 }, { "epoch": 21.455938697318008, "grad_norm": 2.678457260131836, "learning_rate": 3.659530651340997e-05, "loss": 12.3418, "step": 28000 }, { "epoch": 21.532567049808428, "grad_norm": 1.4007712602615356, "learning_rate": 3.6547413793103455e-05, "loss": 12.4764, "step": 28100 }, { "epoch": 21.60919540229885, "grad_norm": 4.606558322906494, "learning_rate": 3.649952107279694e-05, "loss": 12.2566, "step": 28200 }, { "epoch": 21.685823754789272, "grad_norm": 1.354705810546875, "learning_rate": 3.645162835249042e-05, "loss": 12.2371, "step": 28300 }, { "epoch": 21.762452107279692, "grad_norm": 1.7736151218414307, "learning_rate": 3.640373563218391e-05, "loss": 12.4794, "step": 28400 }, { "epoch": 21.839080459770116, "grad_norm": 1.2875999212265015, "learning_rate": 3.6355842911877395e-05, "loss": 12.0016, "step": 28500 }, { "epoch": 21.915708812260537, "grad_norm": 1.932035207748413, "learning_rate": 3.630795019157088e-05, "loss": 12.3018, "step": 28600 }, { "epoch": 21.992337164750957, "grad_norm": 3.066443920135498, "learning_rate": 3.626005747126437e-05, "loss": 12.0117, "step": 28700 }, { "epoch": 22.0, "eval_loss": 13.03292179107666, "eval_runtime": 44.0005, "eval_samples_per_second": 29.659, "eval_steps_per_second": 3.727, "step": 28710 }, { "epoch": 22.06896551724138, "grad_norm": 0.97423255443573, "learning_rate": 3.6212164750957856e-05, "loss": 12.4442, "step": 28800 }, { "epoch": 22.1455938697318, "grad_norm": 1.7552623748779297, "learning_rate": 3.616427203065134e-05, "loss": 12.2976, "step": 28900 }, { "epoch": 22.22222222222222, "grad_norm": 1.5857703685760498, "learning_rate": 3.611637931034483e-05, "loss": 12.1968, "step": 29000 }, { "epoch": 22.298850574712645, "grad_norm": 1.381238341331482, "learning_rate": 3.6068486590038317e-05, "loss": 12.0455, "step": 29100 }, { "epoch": 22.375478927203066, "grad_norm": 1.3380298614501953, "learning_rate": 3.6020593869731803e-05, "loss": 12.1833, "step": 29200 }, { "epoch": 22.452107279693486, "grad_norm": 2.3591909408569336, "learning_rate": 3.5972701149425284e-05, "loss": 12.1562, "step": 29300 }, { "epoch": 22.52873563218391, "grad_norm": 2.544651508331299, "learning_rate": 3.592528735632184e-05, "loss": 12.1318, "step": 29400 }, { "epoch": 22.60536398467433, "grad_norm": 1.204476237297058, "learning_rate": 3.587739463601533e-05, "loss": 12.3856, "step": 29500 }, { "epoch": 22.68199233716475, "grad_norm": 1.453444004058838, "learning_rate": 3.5829501915708816e-05, "loss": 12.0971, "step": 29600 }, { "epoch": 22.75862068965517, "grad_norm": 2.287437915802002, "learning_rate": 3.5781609195402296e-05, "loss": 12.1294, "step": 29700 }, { "epoch": 22.835249042145595, "grad_norm": 2.790942907333374, "learning_rate": 3.573371647509578e-05, "loss": 12.1613, "step": 29800 }, { "epoch": 22.911877394636015, "grad_norm": 1.6170670986175537, "learning_rate": 3.568582375478927e-05, "loss": 12.0175, "step": 29900 }, { "epoch": 22.988505747126435, "grad_norm": 1.724195122718811, "learning_rate": 3.5637931034482757e-05, "loss": 12.1815, "step": 30000 }, { "epoch": 23.0, "eval_loss": 12.992958068847656, "eval_runtime": 44.0141, "eval_samples_per_second": 29.65, "eval_steps_per_second": 3.726, "step": 30015 }, { "epoch": 23.06513409961686, "grad_norm": 3.8932502269744873, "learning_rate": 3.559003831417625e-05, "loss": 12.1987, "step": 30100 }, { "epoch": 23.14176245210728, "grad_norm": 1.8813198804855347, "learning_rate": 3.554214559386974e-05, "loss": 12.2208, "step": 30200 }, { "epoch": 23.2183908045977, "grad_norm": 1.0299080610275269, "learning_rate": 3.5494252873563224e-05, "loss": 12.1662, "step": 30300 }, { "epoch": 23.295019157088124, "grad_norm": 2.68420672416687, "learning_rate": 3.544636015325671e-05, "loss": 12.1013, "step": 30400 }, { "epoch": 23.371647509578544, "grad_norm": 0.9587434530258179, "learning_rate": 3.539846743295019e-05, "loss": 12.3426, "step": 30500 }, { "epoch": 23.448275862068964, "grad_norm": 1.8168953657150269, "learning_rate": 3.535057471264368e-05, "loss": 12.2303, "step": 30600 }, { "epoch": 23.52490421455939, "grad_norm": 1.2712435722351074, "learning_rate": 3.5302681992337165e-05, "loss": 12.275, "step": 30700 }, { "epoch": 23.60153256704981, "grad_norm": 1.0442867279052734, "learning_rate": 3.525478927203065e-05, "loss": 12.1344, "step": 30800 }, { "epoch": 23.67816091954023, "grad_norm": 2.2171154022216797, "learning_rate": 3.520689655172414e-05, "loss": 12.1554, "step": 30900 }, { "epoch": 23.754789272030653, "grad_norm": 1.5863583087921143, "learning_rate": 3.5159003831417625e-05, "loss": 12.1003, "step": 31000 }, { "epoch": 23.831417624521073, "grad_norm": 1.4239143133163452, "learning_rate": 3.511111111111111e-05, "loss": 12.1271, "step": 31100 }, { "epoch": 23.908045977011493, "grad_norm": 2.044018030166626, "learning_rate": 3.50632183908046e-05, "loss": 12.3269, "step": 31200 }, { "epoch": 23.984674329501917, "grad_norm": 2.9049460887908936, "learning_rate": 3.5015325670498086e-05, "loss": 12.0403, "step": 31300 }, { "epoch": 24.0, "eval_loss": 13.009976387023926, "eval_runtime": 44.0062, "eval_samples_per_second": 29.655, "eval_steps_per_second": 3.727, "step": 31320 }, { "epoch": 24.061302681992338, "grad_norm": 1.4207292795181274, "learning_rate": 3.496743295019157e-05, "loss": 12.0634, "step": 31400 }, { "epoch": 24.137931034482758, "grad_norm": 1.886399269104004, "learning_rate": 3.491954022988506e-05, "loss": 12.1573, "step": 31500 }, { "epoch": 24.21455938697318, "grad_norm": 2.239217519760132, "learning_rate": 3.487164750957855e-05, "loss": 12.3025, "step": 31600 }, { "epoch": 24.291187739463602, "grad_norm": 1.495377540588379, "learning_rate": 3.4823754789272034e-05, "loss": 12.1236, "step": 31700 }, { "epoch": 24.367816091954023, "grad_norm": 1.4570187330245972, "learning_rate": 3.477586206896552e-05, "loss": 12.1341, "step": 31800 }, { "epoch": 24.444444444444443, "grad_norm": 1.137839674949646, "learning_rate": 3.472796934865901e-05, "loss": 12.1097, "step": 31900 }, { "epoch": 24.521072796934867, "grad_norm": 1.9981390237808228, "learning_rate": 3.4680076628352494e-05, "loss": 12.4374, "step": 32000 }, { "epoch": 24.597701149425287, "grad_norm": 1.6802810430526733, "learning_rate": 3.463218390804598e-05, "loss": 12.0851, "step": 32100 }, { "epoch": 24.674329501915707, "grad_norm": 2.0081875324249268, "learning_rate": 3.458429118773947e-05, "loss": 12.0883, "step": 32200 }, { "epoch": 24.75095785440613, "grad_norm": 2.637779474258423, "learning_rate": 3.453639846743295e-05, "loss": 12.198, "step": 32300 }, { "epoch": 24.82758620689655, "grad_norm": 6.473161220550537, "learning_rate": 3.4488505747126435e-05, "loss": 12.1459, "step": 32400 }, { "epoch": 24.904214559386972, "grad_norm": 1.3531584739685059, "learning_rate": 3.444061302681992e-05, "loss": 12.0297, "step": 32500 }, { "epoch": 24.980842911877396, "grad_norm": 1.2492320537567139, "learning_rate": 3.439272030651341e-05, "loss": 12.0907, "step": 32600 }, { "epoch": 25.0, "eval_loss": 12.98237419128418, "eval_runtime": 44.0055, "eval_samples_per_second": 29.655, "eval_steps_per_second": 3.727, "step": 32625 }, { "epoch": 25.057471264367816, "grad_norm": 1.2564047574996948, "learning_rate": 3.4344827586206896e-05, "loss": 12.3271, "step": 32700 }, { "epoch": 25.134099616858236, "grad_norm": 1.6601101160049438, "learning_rate": 3.429741379310345e-05, "loss": 12.2568, "step": 32800 }, { "epoch": 25.21072796934866, "grad_norm": 1.8177669048309326, "learning_rate": 3.4249521072796934e-05, "loss": 12.2059, "step": 32900 }, { "epoch": 25.28735632183908, "grad_norm": 1.5476176738739014, "learning_rate": 3.420162835249042e-05, "loss": 12.2871, "step": 33000 }, { "epoch": 25.3639846743295, "grad_norm": 1.305198073387146, "learning_rate": 3.415373563218391e-05, "loss": 12.258, "step": 33100 }, { "epoch": 25.440613026819925, "grad_norm": 5.837198257446289, "learning_rate": 3.4105842911877395e-05, "loss": 12.0855, "step": 33200 }, { "epoch": 25.517241379310345, "grad_norm": 2.148789882659912, "learning_rate": 3.405795019157088e-05, "loss": 12.1539, "step": 33300 }, { "epoch": 25.593869731800766, "grad_norm": 1.8985601663589478, "learning_rate": 3.401005747126437e-05, "loss": 12.2977, "step": 33400 }, { "epoch": 25.67049808429119, "grad_norm": 1.9121934175491333, "learning_rate": 3.3962164750957855e-05, "loss": 12.0616, "step": 33500 }, { "epoch": 25.74712643678161, "grad_norm": 1.3972700834274292, "learning_rate": 3.391427203065134e-05, "loss": 12.0951, "step": 33600 }, { "epoch": 25.82375478927203, "grad_norm": 1.3285768032073975, "learning_rate": 3.386637931034483e-05, "loss": 12.0531, "step": 33700 }, { "epoch": 25.900383141762454, "grad_norm": 2.199030876159668, "learning_rate": 3.3818486590038316e-05, "loss": 11.9635, "step": 33800 }, { "epoch": 25.977011494252874, "grad_norm": 1.0486905574798584, "learning_rate": 3.37705938697318e-05, "loss": 11.9477, "step": 33900 }, { "epoch": 26.0, "eval_loss": 12.954750061035156, "eval_runtime": 44.0151, "eval_samples_per_second": 29.649, "eval_steps_per_second": 3.726, "step": 33930 }, { "epoch": 26.053639846743295, "grad_norm": 1.8525198698043823, "learning_rate": 3.372270114942529e-05, "loss": 11.9857, "step": 34000 }, { "epoch": 26.130268199233715, "grad_norm": 1.4454785585403442, "learning_rate": 3.367480842911878e-05, "loss": 11.8142, "step": 34100 }, { "epoch": 26.20689655172414, "grad_norm": 1.6828280687332153, "learning_rate": 3.3626915708812264e-05, "loss": 11.9359, "step": 34200 }, { "epoch": 26.28352490421456, "grad_norm": 1.898542046546936, "learning_rate": 3.357902298850575e-05, "loss": 12.3808, "step": 34300 }, { "epoch": 26.36015325670498, "grad_norm": 1.3259601593017578, "learning_rate": 3.353113026819924e-05, "loss": 11.9188, "step": 34400 }, { "epoch": 26.436781609195403, "grad_norm": 1.2543106079101562, "learning_rate": 3.348323754789272e-05, "loss": 12.2622, "step": 34500 }, { "epoch": 26.513409961685824, "grad_norm": 1.1741349697113037, "learning_rate": 3.3435344827586204e-05, "loss": 12.3296, "step": 34600 }, { "epoch": 26.590038314176244, "grad_norm": 2.937052011489868, "learning_rate": 3.338745210727969e-05, "loss": 12.0383, "step": 34700 }, { "epoch": 26.666666666666668, "grad_norm": 1.5736559629440308, "learning_rate": 3.333955938697318e-05, "loss": 12.178, "step": 34800 }, { "epoch": 26.743295019157088, "grad_norm": 1.9110735654830933, "learning_rate": 3.329214559386974e-05, "loss": 12.223, "step": 34900 }, { "epoch": 26.81992337164751, "grad_norm": 0.9110540747642517, "learning_rate": 3.324425287356322e-05, "loss": 12.1191, "step": 35000 }, { "epoch": 26.896551724137932, "grad_norm": 1.3772426843643188, "learning_rate": 3.3196360153256704e-05, "loss": 12.1527, "step": 35100 }, { "epoch": 26.973180076628353, "grad_norm": 1.5747685432434082, "learning_rate": 3.314846743295019e-05, "loss": 12.093, "step": 35200 }, { "epoch": 27.0, "eval_loss": 12.915553092956543, "eval_runtime": 44.0197, "eval_samples_per_second": 29.646, "eval_steps_per_second": 3.726, "step": 35235 }, { "epoch": 27.049808429118773, "grad_norm": 1.285940408706665, "learning_rate": 3.310057471264368e-05, "loss": 12.1302, "step": 35300 }, { "epoch": 27.126436781609197, "grad_norm": 1.3924872875213623, "learning_rate": 3.3052681992337164e-05, "loss": 12.2251, "step": 35400 }, { "epoch": 27.203065134099617, "grad_norm": 3.2285568714141846, "learning_rate": 3.300478927203065e-05, "loss": 12.1551, "step": 35500 }, { "epoch": 27.279693486590038, "grad_norm": 1.9970892667770386, "learning_rate": 3.295689655172414e-05, "loss": 12.1276, "step": 35600 }, { "epoch": 27.35632183908046, "grad_norm": 1.5273020267486572, "learning_rate": 3.290900383141763e-05, "loss": 12.3051, "step": 35700 }, { "epoch": 27.43295019157088, "grad_norm": 1.3356541395187378, "learning_rate": 3.286111111111111e-05, "loss": 12.1591, "step": 35800 }, { "epoch": 27.509578544061302, "grad_norm": 1.1603785753250122, "learning_rate": 3.28132183908046e-05, "loss": 11.9451, "step": 35900 }, { "epoch": 27.586206896551722, "grad_norm": 1.2263092994689941, "learning_rate": 3.2765325670498086e-05, "loss": 12.069, "step": 36000 }, { "epoch": 27.662835249042146, "grad_norm": 2.639704465866089, "learning_rate": 3.271743295019157e-05, "loss": 12.0213, "step": 36100 }, { "epoch": 27.739463601532567, "grad_norm": 1.1907585859298706, "learning_rate": 3.266954022988506e-05, "loss": 12.0336, "step": 36200 }, { "epoch": 27.816091954022987, "grad_norm": 2.5226128101348877, "learning_rate": 3.2621647509578546e-05, "loss": 12.1515, "step": 36300 }, { "epoch": 27.89272030651341, "grad_norm": 1.263527274131775, "learning_rate": 3.257375478927203e-05, "loss": 12.067, "step": 36400 }, { "epoch": 27.96934865900383, "grad_norm": 1.636793613433838, "learning_rate": 3.252586206896552e-05, "loss": 12.14, "step": 36500 }, { "epoch": 28.0, "eval_loss": 12.91286563873291, "eval_runtime": 44.033, "eval_samples_per_second": 29.637, "eval_steps_per_second": 3.724, "step": 36540 }, { "epoch": 28.04597701149425, "grad_norm": 1.691573977470398, "learning_rate": 3.247796934865901e-05, "loss": 12.0472, "step": 36600 }, { "epoch": 28.122605363984675, "grad_norm": 2.2020788192749023, "learning_rate": 3.2430076628352494e-05, "loss": 12.0171, "step": 36700 }, { "epoch": 28.199233716475096, "grad_norm": 1.9675192832946777, "learning_rate": 3.2382183908045974e-05, "loss": 12.1335, "step": 36800 }, { "epoch": 28.275862068965516, "grad_norm": 2.210883378982544, "learning_rate": 3.233429118773946e-05, "loss": 12.065, "step": 36900 }, { "epoch": 28.35249042145594, "grad_norm": 1.4574834108352661, "learning_rate": 3.2286398467432954e-05, "loss": 12.0635, "step": 37000 }, { "epoch": 28.42911877394636, "grad_norm": 2.1000685691833496, "learning_rate": 3.223850574712644e-05, "loss": 12.2908, "step": 37100 }, { "epoch": 28.50574712643678, "grad_norm": 2.088956832885742, "learning_rate": 3.2191091954022986e-05, "loss": 12.2421, "step": 37200 }, { "epoch": 28.582375478927204, "grad_norm": 1.5785751342773438, "learning_rate": 3.214319923371647e-05, "loss": 12.0568, "step": 37300 }, { "epoch": 28.659003831417625, "grad_norm": 1.5230878591537476, "learning_rate": 3.209530651340996e-05, "loss": 12.0995, "step": 37400 }, { "epoch": 28.735632183908045, "grad_norm": 1.1175010204315186, "learning_rate": 3.204741379310345e-05, "loss": 12.17, "step": 37500 }, { "epoch": 28.81226053639847, "grad_norm": 1.6524131298065186, "learning_rate": 3.1999521072796934e-05, "loss": 12.1192, "step": 37600 }, { "epoch": 28.88888888888889, "grad_norm": 1.5143946409225464, "learning_rate": 3.195162835249042e-05, "loss": 11.9995, "step": 37700 }, { "epoch": 28.96551724137931, "grad_norm": 1.2787953615188599, "learning_rate": 3.1903735632183914e-05, "loss": 12.0876, "step": 37800 }, { "epoch": 29.0, "eval_loss": 12.9454984664917, "eval_runtime": 44.0594, "eval_samples_per_second": 29.619, "eval_steps_per_second": 3.722, "step": 37845 }, { "epoch": 29.042145593869733, "grad_norm": 1.4434622526168823, "learning_rate": 3.18558429118774e-05, "loss": 11.8509, "step": 37900 }, { "epoch": 29.118773946360154, "grad_norm": 1.2989375591278076, "learning_rate": 3.180795019157088e-05, "loss": 12.1473, "step": 38000 }, { "epoch": 29.195402298850574, "grad_norm": 1.6747602224349976, "learning_rate": 3.176005747126437e-05, "loss": 12.1781, "step": 38100 }, { "epoch": 29.272030651340994, "grad_norm": 2.2328062057495117, "learning_rate": 3.1712164750957855e-05, "loss": 12.2881, "step": 38200 }, { "epoch": 29.34865900383142, "grad_norm": 2.3226537704467773, "learning_rate": 3.166427203065134e-05, "loss": 12.0132, "step": 38300 }, { "epoch": 29.42528735632184, "grad_norm": 1.7786709070205688, "learning_rate": 3.161637931034483e-05, "loss": 12.2086, "step": 38400 }, { "epoch": 29.50191570881226, "grad_norm": 2.359247922897339, "learning_rate": 3.1568486590038316e-05, "loss": 12.3037, "step": 38500 }, { "epoch": 29.578544061302683, "grad_norm": 1.661720633506775, "learning_rate": 3.15205938697318e-05, "loss": 11.9945, "step": 38600 }, { "epoch": 29.655172413793103, "grad_norm": 1.2464226484298706, "learning_rate": 3.147270114942529e-05, "loss": 12.0475, "step": 38700 }, { "epoch": 29.731800766283524, "grad_norm": 5.234483242034912, "learning_rate": 3.1424808429118776e-05, "loss": 12.1442, "step": 38800 }, { "epoch": 29.808429118773947, "grad_norm": 1.2800259590148926, "learning_rate": 3.137691570881226e-05, "loss": 11.923, "step": 38900 }, { "epoch": 29.885057471264368, "grad_norm": 1.3353965282440186, "learning_rate": 3.132902298850574e-05, "loss": 12.0991, "step": 39000 }, { "epoch": 29.961685823754788, "grad_norm": 1.974084734916687, "learning_rate": 3.128113026819924e-05, "loss": 12.0987, "step": 39100 }, { "epoch": 30.0, "eval_loss": 12.926346778869629, "eval_runtime": 44.1327, "eval_samples_per_second": 29.57, "eval_steps_per_second": 3.716, "step": 39150 }, { "epoch": 30.038314176245212, "grad_norm": 2.184515953063965, "learning_rate": 3.1233237547892724e-05, "loss": 11.9969, "step": 39200 }, { "epoch": 30.114942528735632, "grad_norm": 3.448138952255249, "learning_rate": 3.1185823754789276e-05, "loss": 12.2465, "step": 39300 }, { "epoch": 30.191570881226053, "grad_norm": 1.5382182598114014, "learning_rate": 3.113793103448276e-05, "loss": 12.1218, "step": 39400 }, { "epoch": 30.268199233716476, "grad_norm": 1.4232020378112793, "learning_rate": 3.109003831417624e-05, "loss": 12.0744, "step": 39500 }, { "epoch": 30.344827586206897, "grad_norm": 1.130115270614624, "learning_rate": 3.104214559386973e-05, "loss": 11.982, "step": 39600 }, { "epoch": 30.421455938697317, "grad_norm": 0.9410238265991211, "learning_rate": 3.0994252873563216e-05, "loss": 11.9721, "step": 39700 }, { "epoch": 30.49808429118774, "grad_norm": 1.6789051294326782, "learning_rate": 3.09463601532567e-05, "loss": 12.2021, "step": 39800 }, { "epoch": 30.57471264367816, "grad_norm": 1.7361513376235962, "learning_rate": 3.08984674329502e-05, "loss": 12.1236, "step": 39900 }, { "epoch": 30.65134099616858, "grad_norm": 1.868490219116211, "learning_rate": 3.0850574712643684e-05, "loss": 12.0632, "step": 40000 }, { "epoch": 30.727969348659006, "grad_norm": 1.3586502075195312, "learning_rate": 3.080268199233717e-05, "loss": 12.0715, "step": 40100 }, { "epoch": 30.804597701149426, "grad_norm": 1.6496648788452148, "learning_rate": 3.075478927203066e-05, "loss": 12.0989, "step": 40200 }, { "epoch": 30.881226053639846, "grad_norm": 1.8671578168869019, "learning_rate": 3.070689655172414e-05, "loss": 11.996, "step": 40300 }, { "epoch": 30.957854406130267, "grad_norm": 0.9875293374061584, "learning_rate": 3.0659003831417624e-05, "loss": 12.0908, "step": 40400 }, { "epoch": 31.0, "eval_loss": 12.88086986541748, "eval_runtime": 44.1375, "eval_samples_per_second": 29.567, "eval_steps_per_second": 3.716, "step": 40455 }, { "epoch": 31.03448275862069, "grad_norm": 4.194854259490967, "learning_rate": 3.061111111111111e-05, "loss": 12.0422, "step": 40500 }, { "epoch": 31.11111111111111, "grad_norm": 1.550528883934021, "learning_rate": 3.05632183908046e-05, "loss": 12.2051, "step": 40600 }, { "epoch": 31.18773946360153, "grad_norm": 2.011462450027466, "learning_rate": 3.0515325670498085e-05, "loss": 12.1084, "step": 40700 }, { "epoch": 31.264367816091955, "grad_norm": 1.100541114807129, "learning_rate": 3.0467432950191572e-05, "loss": 11.9174, "step": 40800 }, { "epoch": 31.340996168582375, "grad_norm": 1.1993151903152466, "learning_rate": 3.041954022988506e-05, "loss": 12.0801, "step": 40900 }, { "epoch": 31.417624521072796, "grad_norm": 1.501018762588501, "learning_rate": 3.0371647509578542e-05, "loss": 12.1011, "step": 41000 }, { "epoch": 31.49425287356322, "grad_norm": 1.788327932357788, "learning_rate": 3.032375478927203e-05, "loss": 12.192, "step": 41100 }, { "epoch": 31.57088122605364, "grad_norm": 1.7562750577926636, "learning_rate": 3.0275862068965523e-05, "loss": 11.829, "step": 41200 }, { "epoch": 31.64750957854406, "grad_norm": 1.467976450920105, "learning_rate": 3.0227969348659006e-05, "loss": 12.0685, "step": 41300 }, { "epoch": 31.724137931034484, "grad_norm": 2.4010770320892334, "learning_rate": 3.0180076628352493e-05, "loss": 12.0806, "step": 41400 }, { "epoch": 31.800766283524904, "grad_norm": 1.759490728378296, "learning_rate": 3.013218390804598e-05, "loss": 12.1422, "step": 41500 }, { "epoch": 31.877394636015325, "grad_norm": 1.6164530515670776, "learning_rate": 3.0084291187739467e-05, "loss": 12.0766, "step": 41600 }, { "epoch": 31.95402298850575, "grad_norm": 1.3001078367233276, "learning_rate": 3.0036398467432954e-05, "loss": 12.0244, "step": 41700 }, { "epoch": 32.0, "eval_loss": 12.876104354858398, "eval_runtime": 44.1527, "eval_samples_per_second": 29.557, "eval_steps_per_second": 3.714, "step": 41760 }, { "epoch": 32.030651340996165, "grad_norm": 1.1984444856643677, "learning_rate": 2.9988505747126437e-05, "loss": 12.1453, "step": 41800 }, { "epoch": 32.10727969348659, "grad_norm": 0.9655357599258423, "learning_rate": 2.9941091954022986e-05, "loss": 11.8735, "step": 41900 }, { "epoch": 32.18390804597701, "grad_norm": 1.0667262077331543, "learning_rate": 2.989319923371648e-05, "loss": 12.1566, "step": 42000 }, { "epoch": 32.26053639846743, "grad_norm": 1.6131408214569092, "learning_rate": 2.9845306513409966e-05, "loss": 11.9729, "step": 42100 }, { "epoch": 32.337164750957854, "grad_norm": 1.6158314943313599, "learning_rate": 2.979741379310345e-05, "loss": 12.0362, "step": 42200 }, { "epoch": 32.41379310344828, "grad_norm": 1.189818263053894, "learning_rate": 2.9749521072796937e-05, "loss": 12.2135, "step": 42300 }, { "epoch": 32.490421455938694, "grad_norm": 2.628614664077759, "learning_rate": 2.9701628352490423e-05, "loss": 12.032, "step": 42400 }, { "epoch": 32.56704980842912, "grad_norm": 1.6809107065200806, "learning_rate": 2.965373563218391e-05, "loss": 11.81, "step": 42500 }, { "epoch": 32.64367816091954, "grad_norm": 1.6311430931091309, "learning_rate": 2.9605842911877397e-05, "loss": 11.9348, "step": 42600 }, { "epoch": 32.72030651340996, "grad_norm": 1.2387199401855469, "learning_rate": 2.955795019157088e-05, "loss": 12.0694, "step": 42700 }, { "epoch": 32.79693486590038, "grad_norm": 1.7171186208724976, "learning_rate": 2.9510057471264368e-05, "loss": 11.9729, "step": 42800 }, { "epoch": 32.87356321839081, "grad_norm": 1.6134984493255615, "learning_rate": 2.9462164750957854e-05, "loss": 12.1292, "step": 42900 }, { "epoch": 32.95019157088122, "grad_norm": 2.2401788234710693, "learning_rate": 2.941427203065134e-05, "loss": 12.1613, "step": 43000 }, { "epoch": 33.0, "eval_loss": 12.873848915100098, "eval_runtime": 44.126, "eval_samples_per_second": 29.574, "eval_steps_per_second": 3.717, "step": 43065 }, { "epoch": 33.02681992337165, "grad_norm": 1.260538935661316, "learning_rate": 2.9366379310344828e-05, "loss": 12.1855, "step": 43100 }, { "epoch": 33.10344827586207, "grad_norm": 1.7840496301651, "learning_rate": 2.9318486590038312e-05, "loss": 12.0618, "step": 43200 }, { "epoch": 33.18007662835249, "grad_norm": 1.162712574005127, "learning_rate": 2.92705938697318e-05, "loss": 12.2513, "step": 43300 }, { "epoch": 33.25670498084291, "grad_norm": 3.618567705154419, "learning_rate": 2.9222701149425292e-05, "loss": 12.0614, "step": 43400 }, { "epoch": 33.333333333333336, "grad_norm": 1.2605602741241455, "learning_rate": 2.9174808429118776e-05, "loss": 11.9763, "step": 43500 }, { "epoch": 33.40996168582375, "grad_norm": 1.4304360151290894, "learning_rate": 2.9126915708812263e-05, "loss": 12.1044, "step": 43600 }, { "epoch": 33.486590038314176, "grad_norm": 1.1767237186431885, "learning_rate": 2.907902298850575e-05, "loss": 11.8996, "step": 43700 }, { "epoch": 33.5632183908046, "grad_norm": 1.6173638105392456, "learning_rate": 2.9031130268199236e-05, "loss": 11.969, "step": 43800 }, { "epoch": 33.63984674329502, "grad_norm": 1.2231945991516113, "learning_rate": 2.8983237547892723e-05, "loss": 12.2301, "step": 43900 }, { "epoch": 33.71647509578544, "grad_norm": 3.853048801422119, "learning_rate": 2.8935344827586207e-05, "loss": 11.9726, "step": 44000 }, { "epoch": 33.793103448275865, "grad_norm": 1.4259275197982788, "learning_rate": 2.8887452107279694e-05, "loss": 11.9545, "step": 44100 }, { "epoch": 33.86973180076628, "grad_norm": 2.5803606510162354, "learning_rate": 2.883955938697318e-05, "loss": 11.8867, "step": 44200 }, { "epoch": 33.946360153256705, "grad_norm": 1.3688091039657593, "learning_rate": 2.8791666666666667e-05, "loss": 12.0033, "step": 44300 }, { "epoch": 34.0, "eval_loss": 12.871088027954102, "eval_runtime": 44.1202, "eval_samples_per_second": 29.578, "eval_steps_per_second": 3.717, "step": 44370 }, { "epoch": 34.02298850574713, "grad_norm": 1.947970986366272, "learning_rate": 2.8743773946360154e-05, "loss": 11.9572, "step": 44400 }, { "epoch": 34.099616858237546, "grad_norm": 1.9568095207214355, "learning_rate": 2.8696360153256706e-05, "loss": 12.0624, "step": 44500 }, { "epoch": 34.17624521072797, "grad_norm": 1.4037648439407349, "learning_rate": 2.8648467432950193e-05, "loss": 11.8426, "step": 44600 }, { "epoch": 34.252873563218394, "grad_norm": 2.5989620685577393, "learning_rate": 2.860057471264368e-05, "loss": 11.9217, "step": 44700 }, { "epoch": 34.32950191570881, "grad_norm": 1.3627197742462158, "learning_rate": 2.8552681992337167e-05, "loss": 11.9418, "step": 44800 }, { "epoch": 34.406130268199234, "grad_norm": 1.4087576866149902, "learning_rate": 2.8504789272030654e-05, "loss": 12.1608, "step": 44900 }, { "epoch": 34.48275862068966, "grad_norm": 1.4856873750686646, "learning_rate": 2.8456896551724137e-05, "loss": 11.9778, "step": 45000 }, { "epoch": 34.559386973180075, "grad_norm": 1.631663203239441, "learning_rate": 2.8409003831417624e-05, "loss": 12.0547, "step": 45100 }, { "epoch": 34.6360153256705, "grad_norm": 2.1117138862609863, "learning_rate": 2.836111111111111e-05, "loss": 12.0824, "step": 45200 }, { "epoch": 34.71264367816092, "grad_norm": 1.9915541410446167, "learning_rate": 2.8313218390804598e-05, "loss": 12.0984, "step": 45300 }, { "epoch": 34.78927203065134, "grad_norm": 2.4851934909820557, "learning_rate": 2.8265325670498085e-05, "loss": 12.0646, "step": 45400 }, { "epoch": 34.86590038314176, "grad_norm": 1.1414411067962646, "learning_rate": 2.8217432950191575e-05, "loss": 12.0986, "step": 45500 }, { "epoch": 34.94252873563218, "grad_norm": 1.0578815937042236, "learning_rate": 2.8169540229885062e-05, "loss": 12.1035, "step": 45600 }, { "epoch": 35.0, "eval_loss": 12.84704875946045, "eval_runtime": 44.1331, "eval_samples_per_second": 29.57, "eval_steps_per_second": 3.716, "step": 45675 }, { "epoch": 35.019157088122604, "grad_norm": 1.2231003046035767, "learning_rate": 2.812164750957855e-05, "loss": 12.2043, "step": 45700 }, { "epoch": 35.09578544061303, "grad_norm": 1.6044613122940063, "learning_rate": 2.8073754789272032e-05, "loss": 11.9987, "step": 45800 }, { "epoch": 35.172413793103445, "grad_norm": 1.208008050918579, "learning_rate": 2.802586206896552e-05, "loss": 11.7725, "step": 45900 }, { "epoch": 35.24904214559387, "grad_norm": 1.8152436017990112, "learning_rate": 2.7977969348659006e-05, "loss": 11.9232, "step": 46000 }, { "epoch": 35.32567049808429, "grad_norm": 0.9535597562789917, "learning_rate": 2.7930076628352493e-05, "loss": 12.2091, "step": 46100 }, { "epoch": 35.40229885057471, "grad_norm": 1.5778999328613281, "learning_rate": 2.7882183908045976e-05, "loss": 12.0968, "step": 46200 }, { "epoch": 35.47892720306513, "grad_norm": 1.5384963750839233, "learning_rate": 2.7834291187739463e-05, "loss": 12.1058, "step": 46300 }, { "epoch": 35.55555555555556, "grad_norm": 1.1971815824508667, "learning_rate": 2.778639846743295e-05, "loss": 12.048, "step": 46400 }, { "epoch": 35.632183908045974, "grad_norm": 1.2047299146652222, "learning_rate": 2.7738505747126437e-05, "loss": 12.0413, "step": 46500 }, { "epoch": 35.7088122605364, "grad_norm": 1.6629399061203003, "learning_rate": 2.7690613026819924e-05, "loss": 11.9562, "step": 46600 }, { "epoch": 35.78544061302682, "grad_norm": 1.8731905221939087, "learning_rate": 2.7642720306513407e-05, "loss": 12.0334, "step": 46700 }, { "epoch": 35.86206896551724, "grad_norm": 1.5753523111343384, "learning_rate": 2.75948275862069e-05, "loss": 11.9348, "step": 46800 }, { "epoch": 35.93869731800766, "grad_norm": 2.0848851203918457, "learning_rate": 2.7546934865900388e-05, "loss": 12.0199, "step": 46900 }, { "epoch": 36.0, "eval_loss": 12.837443351745605, "eval_runtime": 44.1529, "eval_samples_per_second": 29.556, "eval_steps_per_second": 3.714, "step": 46980 }, { "epoch": 36.015325670498086, "grad_norm": 1.3191312551498413, "learning_rate": 2.749904214559387e-05, "loss": 12.1034, "step": 47000 }, { "epoch": 36.0919540229885, "grad_norm": 1.8107291460037231, "learning_rate": 2.7451149425287358e-05, "loss": 11.9679, "step": 47100 }, { "epoch": 36.16858237547893, "grad_norm": 2.29463529586792, "learning_rate": 2.7403735632183906e-05, "loss": 11.7111, "step": 47200 }, { "epoch": 36.24521072796935, "grad_norm": 1.3297805786132812, "learning_rate": 2.7355842911877393e-05, "loss": 11.8913, "step": 47300 }, { "epoch": 36.32183908045977, "grad_norm": 1.1663862466812134, "learning_rate": 2.730795019157088e-05, "loss": 12.0487, "step": 47400 }, { "epoch": 36.39846743295019, "grad_norm": 1.4846138954162598, "learning_rate": 2.7260057471264367e-05, "loss": 12.1661, "step": 47500 }, { "epoch": 36.475095785440615, "grad_norm": 1.8800255060195923, "learning_rate": 2.7212164750957857e-05, "loss": 11.9248, "step": 47600 }, { "epoch": 36.55172413793103, "grad_norm": 1.7427587509155273, "learning_rate": 2.7164272030651344e-05, "loss": 12.0681, "step": 47700 }, { "epoch": 36.628352490421456, "grad_norm": 2.0017685890197754, "learning_rate": 2.711637931034483e-05, "loss": 12.2556, "step": 47800 }, { "epoch": 36.70498084291188, "grad_norm": 2.765782117843628, "learning_rate": 2.7068486590038318e-05, "loss": 11.8846, "step": 47900 }, { "epoch": 36.7816091954023, "grad_norm": 1.519728422164917, "learning_rate": 2.70205938697318e-05, "loss": 12.0119, "step": 48000 }, { "epoch": 36.85823754789272, "grad_norm": 1.091073989868164, "learning_rate": 2.697270114942529e-05, "loss": 12.1197, "step": 48100 }, { "epoch": 36.934865900383144, "grad_norm": 1.3182342052459717, "learning_rate": 2.6924808429118775e-05, "loss": 12.0217, "step": 48200 }, { "epoch": 37.0, "eval_loss": 12.849996566772461, "eval_runtime": 44.1316, "eval_samples_per_second": 29.571, "eval_steps_per_second": 3.716, "step": 48285 }, { "epoch": 37.01149425287356, "grad_norm": 1.9082536697387695, "learning_rate": 2.6876915708812262e-05, "loss": 12.2391, "step": 48300 }, { "epoch": 37.088122605363985, "grad_norm": 1.5705393552780151, "learning_rate": 2.682902298850575e-05, "loss": 12.1329, "step": 48400 }, { "epoch": 37.16475095785441, "grad_norm": 2.2240869998931885, "learning_rate": 2.6781130268199233e-05, "loss": 12.108, "step": 48500 }, { "epoch": 37.241379310344826, "grad_norm": 1.357383370399475, "learning_rate": 2.673323754789272e-05, "loss": 11.9599, "step": 48600 }, { "epoch": 37.31800766283525, "grad_norm": 2.1634521484375, "learning_rate": 2.6685344827586206e-05, "loss": 12.0339, "step": 48700 }, { "epoch": 37.39463601532567, "grad_norm": 1.611195683479309, "learning_rate": 2.6637452107279693e-05, "loss": 12.0276, "step": 48800 }, { "epoch": 37.47126436781609, "grad_norm": 1.3676810264587402, "learning_rate": 2.6589559386973183e-05, "loss": 11.9487, "step": 48900 }, { "epoch": 37.547892720306514, "grad_norm": 1.4503991603851318, "learning_rate": 2.654166666666667e-05, "loss": 11.9166, "step": 49000 }, { "epoch": 37.62452107279694, "grad_norm": 2.0941789150238037, "learning_rate": 2.6493773946360157e-05, "loss": 12.0909, "step": 49100 }, { "epoch": 37.701149425287355, "grad_norm": 1.4591392278671265, "learning_rate": 2.6445881226053644e-05, "loss": 11.9453, "step": 49200 }, { "epoch": 37.77777777777778, "grad_norm": 1.3402618169784546, "learning_rate": 2.6397988505747128e-05, "loss": 11.9431, "step": 49300 }, { "epoch": 37.8544061302682, "grad_norm": 1.697449803352356, "learning_rate": 2.6350095785440614e-05, "loss": 11.8129, "step": 49400 }, { "epoch": 37.93103448275862, "grad_norm": 1.5764317512512207, "learning_rate": 2.63022030651341e-05, "loss": 11.975, "step": 49500 }, { "epoch": 38.0, "eval_loss": 12.832439422607422, "eval_runtime": 44.0844, "eval_samples_per_second": 29.602, "eval_steps_per_second": 3.72, "step": 49590 }, { "epoch": 38.00766283524904, "grad_norm": 3.7600104808807373, "learning_rate": 2.6254310344827588e-05, "loss": 12.1701, "step": 49600 }, { "epoch": 38.08429118773947, "grad_norm": 1.9188120365142822, "learning_rate": 2.6206417624521075e-05, "loss": 12.0672, "step": 49700 }, { "epoch": 38.160919540229884, "grad_norm": 1.5679752826690674, "learning_rate": 2.615852490421456e-05, "loss": 11.9374, "step": 49800 }, { "epoch": 38.23754789272031, "grad_norm": 1.6603142023086548, "learning_rate": 2.6110632183908045e-05, "loss": 11.8708, "step": 49900 }, { "epoch": 38.31417624521073, "grad_norm": 2.0302236080169678, "learning_rate": 2.6062739463601532e-05, "loss": 12.0997, "step": 50000 }, { "epoch": 38.39080459770115, "grad_norm": 1.4646397829055786, "learning_rate": 2.601484674329502e-05, "loss": 12.1337, "step": 50100 }, { "epoch": 38.46743295019157, "grad_norm": 2.1434216499328613, "learning_rate": 2.5966954022988506e-05, "loss": 12.063, "step": 50200 }, { "epoch": 38.54406130268199, "grad_norm": 1.4451220035552979, "learning_rate": 2.5919061302681996e-05, "loss": 11.8743, "step": 50300 }, { "epoch": 38.62068965517241, "grad_norm": 1.4875038862228394, "learning_rate": 2.5871168582375483e-05, "loss": 12.1545, "step": 50400 }, { "epoch": 38.69731800766284, "grad_norm": 2.4424338340759277, "learning_rate": 2.582327586206897e-05, "loss": 11.9573, "step": 50500 }, { "epoch": 38.77394636015325, "grad_norm": 1.0890432596206665, "learning_rate": 2.5775383141762454e-05, "loss": 11.894, "step": 50600 }, { "epoch": 38.85057471264368, "grad_norm": 1.410107970237732, "learning_rate": 2.572749042145594e-05, "loss": 12.0408, "step": 50700 }, { "epoch": 38.9272030651341, "grad_norm": 1.1632236242294312, "learning_rate": 2.5679597701149427e-05, "loss": 12.0218, "step": 50800 }, { "epoch": 39.0, "eval_loss": 12.819197654724121, "eval_runtime": 44.0917, "eval_samples_per_second": 29.597, "eval_steps_per_second": 3.72, "step": 50895 }, { "epoch": 39.00383141762452, "grad_norm": 1.8346548080444336, "learning_rate": 2.5631704980842914e-05, "loss": 11.9914, "step": 50900 }, { "epoch": 39.08045977011494, "grad_norm": 1.3156729936599731, "learning_rate": 2.55838122605364e-05, "loss": 11.882, "step": 51000 }, { "epoch": 39.157088122605366, "grad_norm": 1.464136004447937, "learning_rate": 2.5535919540229885e-05, "loss": 12.0324, "step": 51100 }, { "epoch": 39.23371647509578, "grad_norm": 1.40706205368042, "learning_rate": 2.548802681992337e-05, "loss": 12.0355, "step": 51200 }, { "epoch": 39.310344827586206, "grad_norm": 1.1469753980636597, "learning_rate": 2.544013409961686e-05, "loss": 11.8437, "step": 51300 }, { "epoch": 39.38697318007663, "grad_norm": 2.110839605331421, "learning_rate": 2.5392241379310345e-05, "loss": 12.0156, "step": 51400 }, { "epoch": 39.46360153256705, "grad_norm": 1.0058891773223877, "learning_rate": 2.534434865900383e-05, "loss": 12.093, "step": 51500 }, { "epoch": 39.54022988505747, "grad_norm": 1.7903035879135132, "learning_rate": 2.5296455938697316e-05, "loss": 12.1111, "step": 51600 }, { "epoch": 39.616858237547895, "grad_norm": 1.7223442792892456, "learning_rate": 2.524856321839081e-05, "loss": 11.8909, "step": 51700 }, { "epoch": 39.69348659003831, "grad_norm": 1.6216609477996826, "learning_rate": 2.5200670498084293e-05, "loss": 12.0638, "step": 51800 }, { "epoch": 39.770114942528735, "grad_norm": 2.2488083839416504, "learning_rate": 2.515277777777778e-05, "loss": 12.193, "step": 51900 }, { "epoch": 39.84674329501916, "grad_norm": 1.9876821041107178, "learning_rate": 2.5104885057471267e-05, "loss": 11.9594, "step": 52000 }, { "epoch": 39.923371647509576, "grad_norm": 2.0479111671447754, "learning_rate": 2.5056992337164753e-05, "loss": 11.8695, "step": 52100 }, { "epoch": 40.0, "grad_norm": 2.512753486633301, "learning_rate": 2.500909961685824e-05, "loss": 11.9546, "step": 52200 }, { "epoch": 40.0, "eval_loss": 12.806585311889648, "eval_runtime": 44.0741, "eval_samples_per_second": 29.609, "eval_steps_per_second": 3.721, "step": 52200 }, { "epoch": 40.076628352490424, "grad_norm": 1.4184033870697021, "learning_rate": 2.4961206896551724e-05, "loss": 11.9875, "step": 52300 }, { "epoch": 40.15325670498084, "grad_norm": 2.1215152740478516, "learning_rate": 2.491331417624521e-05, "loss": 11.8898, "step": 52400 }, { "epoch": 40.229885057471265, "grad_norm": 1.5458124876022339, "learning_rate": 2.4865421455938698e-05, "loss": 12.2281, "step": 52500 }, { "epoch": 40.30651340996169, "grad_norm": 1.336580753326416, "learning_rate": 2.4817528735632184e-05, "loss": 11.743, "step": 52600 }, { "epoch": 40.383141762452105, "grad_norm": 1.1983288526535034, "learning_rate": 2.476963601532567e-05, "loss": 12.0526, "step": 52700 }, { "epoch": 40.45977011494253, "grad_norm": 3.6479368209838867, "learning_rate": 2.4721743295019158e-05, "loss": 11.9597, "step": 52800 }, { "epoch": 40.53639846743295, "grad_norm": 2.154127359390259, "learning_rate": 2.467432950191571e-05, "loss": 11.9651, "step": 52900 }, { "epoch": 40.61302681992337, "grad_norm": 1.476364016532898, "learning_rate": 2.4626436781609197e-05, "loss": 11.8092, "step": 53000 }, { "epoch": 40.689655172413794, "grad_norm": 1.9797921180725098, "learning_rate": 2.4578544061302684e-05, "loss": 12.1406, "step": 53100 }, { "epoch": 40.76628352490422, "grad_norm": 1.5220038890838623, "learning_rate": 2.453065134099617e-05, "loss": 11.8779, "step": 53200 }, { "epoch": 40.842911877394634, "grad_norm": 1.1830068826675415, "learning_rate": 2.4482758620689654e-05, "loss": 12.0007, "step": 53300 }, { "epoch": 40.91954022988506, "grad_norm": 1.3260859251022339, "learning_rate": 2.4434865900383144e-05, "loss": 12.1607, "step": 53400 }, { "epoch": 40.99616858237548, "grad_norm": 1.8781402111053467, "learning_rate": 2.438697318007663e-05, "loss": 11.9159, "step": 53500 }, { "epoch": 41.0, "eval_loss": 12.82541275024414, "eval_runtime": 44.0679, "eval_samples_per_second": 29.613, "eval_steps_per_second": 3.722, "step": 53505 }, { "epoch": 41.0727969348659, "grad_norm": 3.089315891265869, "learning_rate": 2.4339080459770118e-05, "loss": 12.0552, "step": 53600 }, { "epoch": 41.14942528735632, "grad_norm": 1.9572243690490723, "learning_rate": 2.42911877394636e-05, "loss": 12.0124, "step": 53700 }, { "epoch": 41.22605363984675, "grad_norm": 1.6215753555297852, "learning_rate": 2.424329501915709e-05, "loss": 11.9782, "step": 53800 }, { "epoch": 41.30268199233716, "grad_norm": 1.3075189590454102, "learning_rate": 2.4195402298850575e-05, "loss": 12.2317, "step": 53900 }, { "epoch": 41.37931034482759, "grad_norm": 1.1214234828948975, "learning_rate": 2.4147509578544062e-05, "loss": 12.1511, "step": 54000 }, { "epoch": 41.45593869731801, "grad_norm": 8.386270523071289, "learning_rate": 2.409961685823755e-05, "loss": 11.8253, "step": 54100 }, { "epoch": 41.53256704980843, "grad_norm": 5.074198246002197, "learning_rate": 2.4051724137931036e-05, "loss": 12.0205, "step": 54200 }, { "epoch": 41.60919540229885, "grad_norm": 1.2190698385238647, "learning_rate": 2.4003831417624523e-05, "loss": 11.9438, "step": 54300 }, { "epoch": 41.68582375478927, "grad_norm": 1.3544102907180786, "learning_rate": 2.395593869731801e-05, "loss": 12.1235, "step": 54400 }, { "epoch": 41.76245210727969, "grad_norm": 1.080891489982605, "learning_rate": 2.3908045977011497e-05, "loss": 11.7676, "step": 54500 }, { "epoch": 41.839080459770116, "grad_norm": 1.453224539756775, "learning_rate": 2.386015325670498e-05, "loss": 12.0158, "step": 54600 }, { "epoch": 41.91570881226053, "grad_norm": 1.3428503274917603, "learning_rate": 2.3812260536398467e-05, "loss": 11.8066, "step": 54700 }, { "epoch": 41.99233716475096, "grad_norm": 1.3496088981628418, "learning_rate": 2.3764367816091957e-05, "loss": 11.8988, "step": 54800 }, { "epoch": 42.0, "eval_loss": 12.804805755615234, "eval_runtime": 44.1053, "eval_samples_per_second": 29.588, "eval_steps_per_second": 3.718, "step": 54810 }, { "epoch": 42.06896551724138, "grad_norm": 1.2151437997817993, "learning_rate": 2.3716475095785444e-05, "loss": 12.1893, "step": 54900 }, { "epoch": 42.1455938697318, "grad_norm": 1.6184425354003906, "learning_rate": 2.3669061302681993e-05, "loss": 12.0546, "step": 55000 }, { "epoch": 42.22222222222222, "grad_norm": 1.6667332649230957, "learning_rate": 2.362116858237548e-05, "loss": 11.7933, "step": 55100 }, { "epoch": 42.298850574712645, "grad_norm": 3.835425615310669, "learning_rate": 2.3573275862068966e-05, "loss": 11.9275, "step": 55200 }, { "epoch": 42.37547892720306, "grad_norm": 4.450900554656982, "learning_rate": 2.3525383141762453e-05, "loss": 12.1853, "step": 55300 }, { "epoch": 42.452107279693486, "grad_norm": 1.4358230829238892, "learning_rate": 2.347749042145594e-05, "loss": 12.0228, "step": 55400 }, { "epoch": 42.52873563218391, "grad_norm": 1.6793595552444458, "learning_rate": 2.3429597701149427e-05, "loss": 11.9595, "step": 55500 }, { "epoch": 42.60536398467433, "grad_norm": 1.305600643157959, "learning_rate": 2.3381704980842914e-05, "loss": 11.8126, "step": 55600 }, { "epoch": 42.68199233716475, "grad_norm": 1.5794193744659424, "learning_rate": 2.33338122605364e-05, "loss": 12.0154, "step": 55700 }, { "epoch": 42.758620689655174, "grad_norm": 1.6401104927062988, "learning_rate": 2.3285919540229888e-05, "loss": 11.8344, "step": 55800 }, { "epoch": 42.83524904214559, "grad_norm": 1.6348859071731567, "learning_rate": 2.323802681992337e-05, "loss": 12.0174, "step": 55900 }, { "epoch": 42.911877394636015, "grad_norm": 2.6531448364257812, "learning_rate": 2.3190134099616858e-05, "loss": 11.8581, "step": 56000 }, { "epoch": 42.98850574712644, "grad_norm": 1.423274040222168, "learning_rate": 2.3142241379310345e-05, "loss": 11.9313, "step": 56100 }, { "epoch": 43.0, "eval_loss": 12.791069030761719, "eval_runtime": 44.1222, "eval_samples_per_second": 29.577, "eval_steps_per_second": 3.717, "step": 56115 }, { "epoch": 43.065134099616856, "grad_norm": 1.3258931636810303, "learning_rate": 2.3094348659003835e-05, "loss": 11.8864, "step": 56200 }, { "epoch": 43.14176245210728, "grad_norm": 1.4615380764007568, "learning_rate": 2.304645593869732e-05, "loss": 12.0657, "step": 56300 }, { "epoch": 43.2183908045977, "grad_norm": 1.4611597061157227, "learning_rate": 2.2998563218390805e-05, "loss": 11.9148, "step": 56400 }, { "epoch": 43.29501915708812, "grad_norm": 1.7766637802124023, "learning_rate": 2.2950670498084292e-05, "loss": 12.0493, "step": 56500 }, { "epoch": 43.371647509578544, "grad_norm": 1.8123854398727417, "learning_rate": 2.290277777777778e-05, "loss": 11.8749, "step": 56600 }, { "epoch": 43.44827586206897, "grad_norm": 2.2500967979431152, "learning_rate": 2.2854885057471266e-05, "loss": 12.0237, "step": 56700 }, { "epoch": 43.524904214559385, "grad_norm": 1.44577157497406, "learning_rate": 2.280699233716475e-05, "loss": 11.8103, "step": 56800 }, { "epoch": 43.60153256704981, "grad_norm": 1.2959234714508057, "learning_rate": 2.275909961685824e-05, "loss": 12.1443, "step": 56900 }, { "epoch": 43.67816091954023, "grad_norm": 1.849253535270691, "learning_rate": 2.2711206896551727e-05, "loss": 12.037, "step": 57000 }, { "epoch": 43.75478927203065, "grad_norm": 1.46470046043396, "learning_rate": 2.266379310344828e-05, "loss": 12.0392, "step": 57100 }, { "epoch": 43.83141762452107, "grad_norm": 1.7397308349609375, "learning_rate": 2.2615900383141765e-05, "loss": 11.8446, "step": 57200 }, { "epoch": 43.9080459770115, "grad_norm": 1.1144057512283325, "learning_rate": 2.256800766283525e-05, "loss": 12.0084, "step": 57300 }, { "epoch": 43.984674329501914, "grad_norm": 4.426650047302246, "learning_rate": 2.2520114942528736e-05, "loss": 12.0514, "step": 57400 }, { "epoch": 44.0, "eval_loss": 12.808330535888672, "eval_runtime": 44.0792, "eval_samples_per_second": 29.606, "eval_steps_per_second": 3.721, "step": 57420 }, { "epoch": 44.06130268199234, "grad_norm": 1.1355741024017334, "learning_rate": 2.2472222222222223e-05, "loss": 11.9243, "step": 57500 }, { "epoch": 44.13793103448276, "grad_norm": 1.5547679662704468, "learning_rate": 2.2424329501915713e-05, "loss": 12.0711, "step": 57600 }, { "epoch": 44.21455938697318, "grad_norm": 1.5729808807373047, "learning_rate": 2.2376436781609196e-05, "loss": 11.9867, "step": 57700 }, { "epoch": 44.2911877394636, "grad_norm": 1.2912790775299072, "learning_rate": 2.2328544061302683e-05, "loss": 11.8632, "step": 57800 }, { "epoch": 44.367816091954026, "grad_norm": 1.2545444965362549, "learning_rate": 2.228065134099617e-05, "loss": 12.0665, "step": 57900 }, { "epoch": 44.44444444444444, "grad_norm": 1.3165549039840698, "learning_rate": 2.2232758620689657e-05, "loss": 11.842, "step": 58000 }, { "epoch": 44.52107279693487, "grad_norm": 1.7680951356887817, "learning_rate": 2.218486590038314e-05, "loss": 11.8055, "step": 58100 }, { "epoch": 44.59770114942529, "grad_norm": 2.2426023483276367, "learning_rate": 2.2136973180076627e-05, "loss": 12.1153, "step": 58200 }, { "epoch": 44.67432950191571, "grad_norm": 0.9581509828567505, "learning_rate": 2.2089080459770118e-05, "loss": 11.8089, "step": 58300 }, { "epoch": 44.75095785440613, "grad_norm": 2.1268539428710938, "learning_rate": 2.2041187739463605e-05, "loss": 11.8902, "step": 58400 }, { "epoch": 44.827586206896555, "grad_norm": 1.2000526189804077, "learning_rate": 2.1993295019157088e-05, "loss": 11.8651, "step": 58500 }, { "epoch": 44.90421455938697, "grad_norm": 2.349942684173584, "learning_rate": 2.1945402298850575e-05, "loss": 11.9236, "step": 58600 }, { "epoch": 44.980842911877396, "grad_norm": 1.639948844909668, "learning_rate": 2.1897509578544062e-05, "loss": 11.9533, "step": 58700 }, { "epoch": 45.0, "eval_loss": 12.792840003967285, "eval_runtime": 44.0555, "eval_samples_per_second": 29.622, "eval_steps_per_second": 3.723, "step": 58725 }, { "epoch": 45.05747126436781, "grad_norm": 0.9822871088981628, "learning_rate": 2.184961685823755e-05, "loss": 11.9065, "step": 58800 }, { "epoch": 45.13409961685824, "grad_norm": 5.536319255828857, "learning_rate": 2.1801724137931036e-05, "loss": 11.9411, "step": 58900 }, { "epoch": 45.21072796934866, "grad_norm": 1.8267079591751099, "learning_rate": 2.1753831417624522e-05, "loss": 11.8592, "step": 59000 }, { "epoch": 45.28735632183908, "grad_norm": 1.453710675239563, "learning_rate": 2.170593869731801e-05, "loss": 12.246, "step": 59100 }, { "epoch": 45.3639846743295, "grad_norm": 1.5747921466827393, "learning_rate": 2.1658045977011496e-05, "loss": 12.1555, "step": 59200 }, { "epoch": 45.440613026819925, "grad_norm": 0.9929379224777222, "learning_rate": 2.1610153256704983e-05, "loss": 11.7682, "step": 59300 }, { "epoch": 45.51724137931034, "grad_norm": 1.4931187629699707, "learning_rate": 2.1562260536398467e-05, "loss": 11.8555, "step": 59400 }, { "epoch": 45.593869731800766, "grad_norm": 1.114998459815979, "learning_rate": 2.1514367816091953e-05, "loss": 11.8726, "step": 59500 }, { "epoch": 45.67049808429119, "grad_norm": 1.7308725118637085, "learning_rate": 2.146647509578544e-05, "loss": 12.0875, "step": 59600 }, { "epoch": 45.747126436781606, "grad_norm": 1.1630358695983887, "learning_rate": 2.141858237547893e-05, "loss": 11.8994, "step": 59700 }, { "epoch": 45.82375478927203, "grad_norm": 1.9863486289978027, "learning_rate": 2.1370689655172414e-05, "loss": 11.9502, "step": 59800 }, { "epoch": 45.900383141762454, "grad_norm": 1.3612456321716309, "learning_rate": 2.13227969348659e-05, "loss": 11.8048, "step": 59900 }, { "epoch": 45.97701149425287, "grad_norm": 1.1734110116958618, "learning_rate": 2.1274904214559388e-05, "loss": 12.1155, "step": 60000 }, { "epoch": 46.0, "eval_loss": 12.802705764770508, "eval_runtime": 44.0902, "eval_samples_per_second": 29.598, "eval_steps_per_second": 3.72, "step": 60030 }, { "epoch": 46.053639846743295, "grad_norm": 2.19791841506958, "learning_rate": 2.1227011494252875e-05, "loss": 12.0121, "step": 60100 }, { "epoch": 46.13026819923372, "grad_norm": 3.206514358520508, "learning_rate": 2.1179597701149426e-05, "loss": 11.9131, "step": 60200 }, { "epoch": 46.206896551724135, "grad_norm": 1.2101006507873535, "learning_rate": 2.1131704980842913e-05, "loss": 11.869, "step": 60300 }, { "epoch": 46.28352490421456, "grad_norm": 1.3384582996368408, "learning_rate": 2.10838122605364e-05, "loss": 11.7608, "step": 60400 }, { "epoch": 46.36015325670498, "grad_norm": 3.215064764022827, "learning_rate": 2.1035919540229887e-05, "loss": 12.0495, "step": 60500 }, { "epoch": 46.4367816091954, "grad_norm": 1.26254403591156, "learning_rate": 2.0988026819923374e-05, "loss": 11.8855, "step": 60600 }, { "epoch": 46.513409961685824, "grad_norm": 1.139722466468811, "learning_rate": 2.094013409961686e-05, "loss": 12.0157, "step": 60700 }, { "epoch": 46.59003831417625, "grad_norm": 1.9146323204040527, "learning_rate": 2.0892241379310344e-05, "loss": 11.8276, "step": 60800 }, { "epoch": 46.666666666666664, "grad_norm": 1.6539549827575684, "learning_rate": 2.084434865900383e-05, "loss": 11.9677, "step": 60900 }, { "epoch": 46.74329501915709, "grad_norm": 1.2380534410476685, "learning_rate": 2.0796455938697318e-05, "loss": 12.0291, "step": 61000 }, { "epoch": 46.81992337164751, "grad_norm": 1.8375437259674072, "learning_rate": 2.074856321839081e-05, "loss": 11.9032, "step": 61100 }, { "epoch": 46.89655172413793, "grad_norm": 2.2188262939453125, "learning_rate": 2.0700670498084292e-05, "loss": 12.0465, "step": 61200 }, { "epoch": 46.97318007662835, "grad_norm": 1.1582258939743042, "learning_rate": 2.065277777777778e-05, "loss": 11.924, "step": 61300 }, { "epoch": 47.0, "eval_loss": 12.797731399536133, "eval_runtime": 44.1559, "eval_samples_per_second": 29.554, "eval_steps_per_second": 3.714, "step": 61335 }, { "epoch": 47.04980842911878, "grad_norm": 3.067289352416992, "learning_rate": 2.0604885057471266e-05, "loss": 11.8265, "step": 61400 }, { "epoch": 47.12643678160919, "grad_norm": 1.3472516536712646, "learning_rate": 2.0556992337164752e-05, "loss": 11.8763, "step": 61500 }, { "epoch": 47.20306513409962, "grad_norm": 1.4235740900039673, "learning_rate": 2.050909961685824e-05, "loss": 11.9473, "step": 61600 }, { "epoch": 47.27969348659004, "grad_norm": 1.3170359134674072, "learning_rate": 2.0461206896551723e-05, "loss": 11.9381, "step": 61700 }, { "epoch": 47.35632183908046, "grad_norm": 1.6014246940612793, "learning_rate": 2.0413314176245213e-05, "loss": 11.9074, "step": 61800 }, { "epoch": 47.43295019157088, "grad_norm": 1.3270535469055176, "learning_rate": 2.03654214559387e-05, "loss": 11.9903, "step": 61900 }, { "epoch": 47.509578544061306, "grad_norm": 1.1905503273010254, "learning_rate": 2.0317528735632187e-05, "loss": 11.9629, "step": 62000 }, { "epoch": 47.58620689655172, "grad_norm": 1.546738862991333, "learning_rate": 2.026963601532567e-05, "loss": 11.831, "step": 62100 }, { "epoch": 47.662835249042146, "grad_norm": 1.5887172222137451, "learning_rate": 2.0221743295019157e-05, "loss": 12.0534, "step": 62200 }, { "epoch": 47.73946360153257, "grad_norm": 1.3189942836761475, "learning_rate": 2.0173850574712644e-05, "loss": 11.9131, "step": 62300 }, { "epoch": 47.81609195402299, "grad_norm": 1.9591014385223389, "learning_rate": 2.012595785440613e-05, "loss": 11.8583, "step": 62400 }, { "epoch": 47.89272030651341, "grad_norm": 1.6344765424728394, "learning_rate": 2.0078065134099618e-05, "loss": 11.9921, "step": 62500 }, { "epoch": 47.969348659003835, "grad_norm": 1.1810266971588135, "learning_rate": 2.0030172413793105e-05, "loss": 11.9987, "step": 62600 }, { "epoch": 48.0, "eval_loss": 12.767735481262207, "eval_runtime": 44.144, "eval_samples_per_second": 29.562, "eval_steps_per_second": 3.715, "step": 62640 }, { "epoch": 48.04597701149425, "grad_norm": 1.4370075464248657, "learning_rate": 1.998227969348659e-05, "loss": 12.0014, "step": 62700 }, { "epoch": 48.122605363984675, "grad_norm": 1.2901791334152222, "learning_rate": 1.993438697318008e-05, "loss": 12.0385, "step": 62800 }, { "epoch": 48.1992337164751, "grad_norm": 1.2324562072753906, "learning_rate": 1.9886494252873565e-05, "loss": 11.9594, "step": 62900 }, { "epoch": 48.275862068965516, "grad_norm": 1.40041983127594, "learning_rate": 1.983860153256705e-05, "loss": 11.76, "step": 63000 }, { "epoch": 48.35249042145594, "grad_norm": 1.5981560945510864, "learning_rate": 1.9790708812260536e-05, "loss": 11.8416, "step": 63100 }, { "epoch": 48.42911877394636, "grad_norm": 1.5366255044937134, "learning_rate": 1.974329501915709e-05, "loss": 11.9168, "step": 63200 }, { "epoch": 48.50574712643678, "grad_norm": 2.1091346740722656, "learning_rate": 1.9695402298850578e-05, "loss": 11.7809, "step": 63300 }, { "epoch": 48.582375478927204, "grad_norm": 3.076678991317749, "learning_rate": 1.964750957854406e-05, "loss": 11.8881, "step": 63400 }, { "epoch": 48.65900383141762, "grad_norm": 1.6555073261260986, "learning_rate": 1.9599616858237548e-05, "loss": 11.6799, "step": 63500 }, { "epoch": 48.735632183908045, "grad_norm": 1.2696727514266968, "learning_rate": 1.9551724137931035e-05, "loss": 12.0306, "step": 63600 }, { "epoch": 48.81226053639847, "grad_norm": 1.739827275276184, "learning_rate": 1.9503831417624522e-05, "loss": 12.1005, "step": 63700 }, { "epoch": 48.888888888888886, "grad_norm": 1.187231421470642, "learning_rate": 1.945593869731801e-05, "loss": 11.9703, "step": 63800 }, { "epoch": 48.96551724137931, "grad_norm": 2.756282091140747, "learning_rate": 1.9408045977011496e-05, "loss": 12.0693, "step": 63900 }, { "epoch": 49.0, "eval_loss": 12.775006294250488, "eval_runtime": 44.1249, "eval_samples_per_second": 29.575, "eval_steps_per_second": 3.717, "step": 63945 }, { "epoch": 49.04214559386973, "grad_norm": 0.967854917049408, "learning_rate": 1.9360153256704983e-05, "loss": 11.9437, "step": 64000 }, { "epoch": 49.11877394636015, "grad_norm": 1.2055004835128784, "learning_rate": 1.931226053639847e-05, "loss": 11.9037, "step": 64100 }, { "epoch": 49.195402298850574, "grad_norm": 1.6203746795654297, "learning_rate": 1.9264367816091956e-05, "loss": 11.9823, "step": 64200 }, { "epoch": 49.272030651341, "grad_norm": 1.1399292945861816, "learning_rate": 1.921647509578544e-05, "loss": 12.0721, "step": 64300 }, { "epoch": 49.348659003831415, "grad_norm": 1.3431105613708496, "learning_rate": 1.9168582375478927e-05, "loss": 11.8897, "step": 64400 }, { "epoch": 49.42528735632184, "grad_norm": 1.316723346710205, "learning_rate": 1.9120689655172414e-05, "loss": 11.9025, "step": 64500 }, { "epoch": 49.50191570881226, "grad_norm": 1.8449369668960571, "learning_rate": 1.9072796934865904e-05, "loss": 11.6683, "step": 64600 }, { "epoch": 49.57854406130268, "grad_norm": 1.3772321939468384, "learning_rate": 1.9024904214559387e-05, "loss": 12.2022, "step": 64700 }, { "epoch": 49.6551724137931, "grad_norm": 2.2538058757781982, "learning_rate": 1.8977011494252874e-05, "loss": 11.8425, "step": 64800 }, { "epoch": 49.73180076628353, "grad_norm": 2.1310970783233643, "learning_rate": 1.892911877394636e-05, "loss": 11.9638, "step": 64900 }, { "epoch": 49.808429118773944, "grad_norm": 1.2570499181747437, "learning_rate": 1.8881226053639848e-05, "loss": 12.0367, "step": 65000 }, { "epoch": 49.88505747126437, "grad_norm": 1.6000453233718872, "learning_rate": 1.8833333333333335e-05, "loss": 12.0249, "step": 65100 }, { "epoch": 49.96168582375479, "grad_norm": 1.2556895017623901, "learning_rate": 1.878544061302682e-05, "loss": 11.9285, "step": 65200 }, { "epoch": 50.0, "eval_loss": 12.788679122924805, "eval_runtime": 44.0734, "eval_samples_per_second": 29.61, "eval_steps_per_second": 3.721, "step": 65250 }, { "epoch": 50.03831417624521, "grad_norm": 1.4611543416976929, "learning_rate": 1.873754789272031e-05, "loss": 12.0139, "step": 65300 }, { "epoch": 50.11494252873563, "grad_norm": 1.3939285278320312, "learning_rate": 1.869013409961686e-05, "loss": 12.1466, "step": 65400 }, { "epoch": 50.191570881226056, "grad_norm": 1.378446102142334, "learning_rate": 1.8642241379310347e-05, "loss": 12.0221, "step": 65500 }, { "epoch": 50.26819923371647, "grad_norm": 1.1458476781845093, "learning_rate": 1.859434865900383e-05, "loss": 11.98, "step": 65600 }, { "epoch": 50.3448275862069, "grad_norm": 1.2113792896270752, "learning_rate": 1.8546455938697318e-05, "loss": 11.7938, "step": 65700 }, { "epoch": 50.42145593869732, "grad_norm": 3.7647705078125, "learning_rate": 1.8498563218390804e-05, "loss": 12.046, "step": 65800 }, { "epoch": 50.49808429118774, "grad_norm": 1.4086334705352783, "learning_rate": 1.845067049808429e-05, "loss": 12.0137, "step": 65900 }, { "epoch": 50.57471264367816, "grad_norm": 2.212301254272461, "learning_rate": 1.8402777777777778e-05, "loss": 11.8535, "step": 66000 }, { "epoch": 50.651340996168585, "grad_norm": 1.1334259510040283, "learning_rate": 1.8354885057471265e-05, "loss": 11.7534, "step": 66100 }, { "epoch": 50.727969348659, "grad_norm": 1.3607604503631592, "learning_rate": 1.8306992337164752e-05, "loss": 12.1351, "step": 66200 }, { "epoch": 50.804597701149426, "grad_norm": 0.9516454935073853, "learning_rate": 1.825909961685824e-05, "loss": 11.8739, "step": 66300 }, { "epoch": 50.88122605363985, "grad_norm": 1.7874857187271118, "learning_rate": 1.8211206896551726e-05, "loss": 12.0046, "step": 66400 }, { "epoch": 50.95785440613027, "grad_norm": 1.1303731203079224, "learning_rate": 1.816331417624521e-05, "loss": 11.8135, "step": 66500 }, { "epoch": 51.0, "eval_loss": 12.762798309326172, "eval_runtime": 44.1477, "eval_samples_per_second": 29.56, "eval_steps_per_second": 3.715, "step": 66555 }, { "epoch": 51.03448275862069, "grad_norm": 2.8881723880767822, "learning_rate": 1.8115421455938696e-05, "loss": 11.8533, "step": 66600 }, { "epoch": 51.111111111111114, "grad_norm": 1.2278690338134766, "learning_rate": 1.8067528735632186e-05, "loss": 11.9214, "step": 66700 }, { "epoch": 51.18773946360153, "grad_norm": 1.9933656454086304, "learning_rate": 1.8019636015325673e-05, "loss": 11.8527, "step": 66800 }, { "epoch": 51.264367816091955, "grad_norm": 1.4205143451690674, "learning_rate": 1.7971743295019157e-05, "loss": 12.0251, "step": 66900 }, { "epoch": 51.34099616858238, "grad_norm": 1.319817304611206, "learning_rate": 1.7923850574712644e-05, "loss": 12.0983, "step": 67000 }, { "epoch": 51.417624521072796, "grad_norm": 1.6209360361099243, "learning_rate": 1.787595785440613e-05, "loss": 11.8053, "step": 67100 }, { "epoch": 51.49425287356322, "grad_norm": 1.0465126037597656, "learning_rate": 1.7828065134099617e-05, "loss": 12.0158, "step": 67200 }, { "epoch": 51.57088122605364, "grad_norm": 1.4087551832199097, "learning_rate": 1.7780172413793104e-05, "loss": 11.9305, "step": 67300 }, { "epoch": 51.64750957854406, "grad_norm": 1.121779203414917, "learning_rate": 1.773227969348659e-05, "loss": 12.1881, "step": 67400 }, { "epoch": 51.724137931034484, "grad_norm": 1.5989633798599243, "learning_rate": 1.7684386973180078e-05, "loss": 11.9698, "step": 67500 }, { "epoch": 51.8007662835249, "grad_norm": 1.1244069337844849, "learning_rate": 1.7636494252873565e-05, "loss": 11.7475, "step": 67600 }, { "epoch": 51.877394636015325, "grad_norm": 1.2594223022460938, "learning_rate": 1.7589080459770117e-05, "loss": 11.9611, "step": 67700 }, { "epoch": 51.95402298850575, "grad_norm": 1.6870946884155273, "learning_rate": 1.7541187739463604e-05, "loss": 11.7075, "step": 67800 }, { "epoch": 52.0, "eval_loss": 12.814347267150879, "eval_runtime": 44.0743, "eval_samples_per_second": 29.609, "eval_steps_per_second": 3.721, "step": 67860 }, { "epoch": 52.030651340996165, "grad_norm": 1.1319911479949951, "learning_rate": 1.7493295019157087e-05, "loss": 11.8327, "step": 67900 }, { "epoch": 52.10727969348659, "grad_norm": 1.0522786378860474, "learning_rate": 1.7445402298850574e-05, "loss": 11.858, "step": 68000 }, { "epoch": 52.18390804597701, "grad_norm": 1.7333852052688599, "learning_rate": 1.7397509578544064e-05, "loss": 12.037, "step": 68100 }, { "epoch": 52.26053639846743, "grad_norm": 1.7924898862838745, "learning_rate": 1.734961685823755e-05, "loss": 12.0778, "step": 68200 }, { "epoch": 52.337164750957854, "grad_norm": 1.221550464630127, "learning_rate": 1.7301724137931035e-05, "loss": 12.1365, "step": 68300 }, { "epoch": 52.41379310344828, "grad_norm": 1.6241466999053955, "learning_rate": 1.725383141762452e-05, "loss": 12.0267, "step": 68400 }, { "epoch": 52.490421455938694, "grad_norm": 1.7579493522644043, "learning_rate": 1.720593869731801e-05, "loss": 11.7834, "step": 68500 }, { "epoch": 52.56704980842912, "grad_norm": 1.4909967184066772, "learning_rate": 1.7158045977011495e-05, "loss": 11.9632, "step": 68600 }, { "epoch": 52.64367816091954, "grad_norm": 2.0708203315734863, "learning_rate": 1.7110153256704982e-05, "loss": 11.9318, "step": 68700 }, { "epoch": 52.72030651340996, "grad_norm": 1.1900310516357422, "learning_rate": 1.706226053639847e-05, "loss": 11.8145, "step": 68800 }, { "epoch": 52.79693486590038, "grad_norm": 1.2245934009552002, "learning_rate": 1.7014367816091956e-05, "loss": 11.6663, "step": 68900 }, { "epoch": 52.87356321839081, "grad_norm": 1.6178796291351318, "learning_rate": 1.6966475095785443e-05, "loss": 11.9844, "step": 69000 }, { "epoch": 52.95019157088122, "grad_norm": 1.2077674865722656, "learning_rate": 1.691858237547893e-05, "loss": 11.6393, "step": 69100 }, { "epoch": 53.0, "eval_loss": 12.77491283416748, "eval_runtime": 44.1152, "eval_samples_per_second": 29.582, "eval_steps_per_second": 3.718, "step": 69165 }, { "epoch": 53.02681992337165, "grad_norm": 1.2087703943252563, "learning_rate": 1.6870689655172413e-05, "loss": 11.8316, "step": 69200 }, { "epoch": 53.10344827586207, "grad_norm": 1.472959280014038, "learning_rate": 1.68227969348659e-05, "loss": 11.9068, "step": 69300 }, { "epoch": 53.18007662835249, "grad_norm": 1.2973859310150146, "learning_rate": 1.6774904214559387e-05, "loss": 11.8753, "step": 69400 }, { "epoch": 53.25670498084291, "grad_norm": 1.3909817934036255, "learning_rate": 1.6727011494252877e-05, "loss": 11.6868, "step": 69500 }, { "epoch": 53.333333333333336, "grad_norm": 1.1226869821548462, "learning_rate": 1.667911877394636e-05, "loss": 11.7399, "step": 69600 }, { "epoch": 53.40996168582375, "grad_norm": 1.6086245775222778, "learning_rate": 1.6631226053639847e-05, "loss": 11.9871, "step": 69700 }, { "epoch": 53.486590038314176, "grad_norm": 5.143097400665283, "learning_rate": 1.65838122605364e-05, "loss": 12.0991, "step": 69800 }, { "epoch": 53.5632183908046, "grad_norm": 1.1883777379989624, "learning_rate": 1.6535919540229886e-05, "loss": 11.7275, "step": 69900 }, { "epoch": 53.63984674329502, "grad_norm": 1.152468204498291, "learning_rate": 1.6488026819923373e-05, "loss": 11.9268, "step": 70000 }, { "epoch": 53.71647509578544, "grad_norm": 1.6981552839279175, "learning_rate": 1.6440134099616856e-05, "loss": 12.0293, "step": 70100 }, { "epoch": 53.793103448275865, "grad_norm": 1.6067506074905396, "learning_rate": 1.6392241379310347e-05, "loss": 11.9477, "step": 70200 }, { "epoch": 53.86973180076628, "grad_norm": 3.569709539413452, "learning_rate": 1.6344348659003834e-05, "loss": 11.8055, "step": 70300 }, { "epoch": 53.946360153256705, "grad_norm": 2.3322157859802246, "learning_rate": 1.629645593869732e-05, "loss": 12.027, "step": 70400 }, { "epoch": 54.0, "eval_loss": 12.753838539123535, "eval_runtime": 44.1081, "eval_samples_per_second": 29.586, "eval_steps_per_second": 3.718, "step": 70470 }, { "epoch": 54.02298850574713, "grad_norm": 1.4370397329330444, "learning_rate": 1.6248563218390804e-05, "loss": 12.0639, "step": 70500 }, { "epoch": 54.099616858237546, "grad_norm": 2.486645221710205, "learning_rate": 1.620067049808429e-05, "loss": 11.9231, "step": 70600 }, { "epoch": 54.17624521072797, "grad_norm": 2.0936434268951416, "learning_rate": 1.6152777777777778e-05, "loss": 11.9161, "step": 70700 }, { "epoch": 54.252873563218394, "grad_norm": 1.5211490392684937, "learning_rate": 1.6104885057471265e-05, "loss": 11.9338, "step": 70800 }, { "epoch": 54.32950191570881, "grad_norm": 1.035090684890747, "learning_rate": 1.605699233716475e-05, "loss": 11.7872, "step": 70900 }, { "epoch": 54.406130268199234, "grad_norm": 1.617077112197876, "learning_rate": 1.600909961685824e-05, "loss": 11.9772, "step": 71000 }, { "epoch": 54.48275862068966, "grad_norm": 1.3988826274871826, "learning_rate": 1.5961206896551725e-05, "loss": 12.0088, "step": 71100 }, { "epoch": 54.559386973180075, "grad_norm": 1.7126933336257935, "learning_rate": 1.5913314176245212e-05, "loss": 11.9831, "step": 71200 }, { "epoch": 54.6360153256705, "grad_norm": 2.3251850605010986, "learning_rate": 1.58654214559387e-05, "loss": 11.7345, "step": 71300 }, { "epoch": 54.71264367816092, "grad_norm": 1.6456447839736938, "learning_rate": 1.5817528735632183e-05, "loss": 12.0158, "step": 71400 }, { "epoch": 54.78927203065134, "grad_norm": 2.1808829307556152, "learning_rate": 1.576963601532567e-05, "loss": 12.0169, "step": 71500 }, { "epoch": 54.86590038314176, "grad_norm": 2.2233774662017822, "learning_rate": 1.572174329501916e-05, "loss": 11.9144, "step": 71600 }, { "epoch": 54.94252873563218, "grad_norm": 1.5419303178787231, "learning_rate": 1.5673850574712647e-05, "loss": 11.7915, "step": 71700 }, { "epoch": 55.0, "eval_loss": 12.735248565673828, "eval_runtime": 44.0526, "eval_samples_per_second": 29.624, "eval_steps_per_second": 3.723, "step": 71775 }, { "epoch": 55.019157088122604, "grad_norm": 2.4967896938323975, "learning_rate": 1.562595785440613e-05, "loss": 12.1777, "step": 71800 }, { "epoch": 55.09578544061303, "grad_norm": 1.6103179454803467, "learning_rate": 1.5578065134099617e-05, "loss": 12.0236, "step": 71900 }, { "epoch": 55.172413793103445, "grad_norm": 1.058643102645874, "learning_rate": 1.553065134099617e-05, "loss": 11.9485, "step": 72000 }, { "epoch": 55.24904214559387, "grad_norm": 1.1860133409500122, "learning_rate": 1.5482758620689656e-05, "loss": 11.7885, "step": 72100 }, { "epoch": 55.32567049808429, "grad_norm": 2.6516213417053223, "learning_rate": 1.5434865900383142e-05, "loss": 11.8373, "step": 72200 }, { "epoch": 55.40229885057471, "grad_norm": 1.3108186721801758, "learning_rate": 1.538697318007663e-05, "loss": 11.8938, "step": 72300 }, { "epoch": 55.47892720306513, "grad_norm": 2.721954345703125, "learning_rate": 1.5339080459770116e-05, "loss": 11.873, "step": 72400 }, { "epoch": 55.55555555555556, "grad_norm": 1.0352996587753296, "learning_rate": 1.5291187739463603e-05, "loss": 12.025, "step": 72500 }, { "epoch": 55.632183908045974, "grad_norm": 1.258169412612915, "learning_rate": 1.5243295019157088e-05, "loss": 11.9444, "step": 72600 }, { "epoch": 55.7088122605364, "grad_norm": 2.314866781234741, "learning_rate": 1.5195402298850575e-05, "loss": 11.711, "step": 72700 }, { "epoch": 55.78544061302682, "grad_norm": 1.308590292930603, "learning_rate": 1.5147509578544062e-05, "loss": 12.0446, "step": 72800 }, { "epoch": 55.86206896551724, "grad_norm": 2.928891897201538, "learning_rate": 1.5099616858237547e-05, "loss": 11.9413, "step": 72900 }, { "epoch": 55.93869731800766, "grad_norm": 1.048743724822998, "learning_rate": 1.5051724137931036e-05, "loss": 11.791, "step": 73000 }, { "epoch": 56.0, "eval_loss": 12.750344276428223, "eval_runtime": 44.088, "eval_samples_per_second": 29.6, "eval_steps_per_second": 3.72, "step": 73080 }, { "epoch": 56.015325670498086, "grad_norm": 3.6337478160858154, "learning_rate": 1.5003831417624523e-05, "loss": 11.9951, "step": 73100 }, { "epoch": 56.0919540229885, "grad_norm": 1.7665445804595947, "learning_rate": 1.495593869731801e-05, "loss": 12.1332, "step": 73200 }, { "epoch": 56.16858237547893, "grad_norm": 1.4894465208053589, "learning_rate": 1.4908045977011495e-05, "loss": 11.7198, "step": 73300 }, { "epoch": 56.24521072796935, "grad_norm": 1.0169578790664673, "learning_rate": 1.4860153256704982e-05, "loss": 12.0523, "step": 73400 }, { "epoch": 56.32183908045977, "grad_norm": 1.2872236967086792, "learning_rate": 1.4812260536398467e-05, "loss": 11.8438, "step": 73500 }, { "epoch": 56.39846743295019, "grad_norm": 1.1032931804656982, "learning_rate": 1.4764367816091954e-05, "loss": 11.9058, "step": 73600 }, { "epoch": 56.475095785440615, "grad_norm": 1.4371570348739624, "learning_rate": 1.4716475095785442e-05, "loss": 11.9199, "step": 73700 }, { "epoch": 56.55172413793103, "grad_norm": 1.9667787551879883, "learning_rate": 1.4668582375478929e-05, "loss": 11.899, "step": 73800 }, { "epoch": 56.628352490421456, "grad_norm": 1.2465131282806396, "learning_rate": 1.4620689655172414e-05, "loss": 11.9303, "step": 73900 }, { "epoch": 56.70498084291188, "grad_norm": 1.2738486528396606, "learning_rate": 1.4573275862068966e-05, "loss": 11.9897, "step": 74000 }, { "epoch": 56.7816091954023, "grad_norm": 1.7295411825180054, "learning_rate": 1.4525383141762453e-05, "loss": 11.989, "step": 74100 }, { "epoch": 56.85823754789272, "grad_norm": 3.2072668075561523, "learning_rate": 1.4477490421455938e-05, "loss": 11.8107, "step": 74200 }, { "epoch": 56.934865900383144, "grad_norm": 1.3828212022781372, "learning_rate": 1.4429597701149425e-05, "loss": 11.7899, "step": 74300 }, { "epoch": 57.0, "eval_loss": 12.746719360351562, "eval_runtime": 44.0689, "eval_samples_per_second": 29.613, "eval_steps_per_second": 3.721, "step": 74385 }, { "epoch": 57.01149425287356, "grad_norm": 1.1235148906707764, "learning_rate": 1.4381704980842914e-05, "loss": 11.9095, "step": 74400 }, { "epoch": 57.088122605363985, "grad_norm": 1.3013513088226318, "learning_rate": 1.43338122605364e-05, "loss": 11.8367, "step": 74500 }, { "epoch": 57.16475095785441, "grad_norm": 1.46478271484375, "learning_rate": 1.4285919540229886e-05, "loss": 11.8926, "step": 74600 }, { "epoch": 57.241379310344826, "grad_norm": 1.7883129119873047, "learning_rate": 1.4238026819923373e-05, "loss": 11.7109, "step": 74700 }, { "epoch": 57.31800766283525, "grad_norm": 2.2156434059143066, "learning_rate": 1.419013409961686e-05, "loss": 11.9904, "step": 74800 }, { "epoch": 57.39463601532567, "grad_norm": 1.963996410369873, "learning_rate": 1.4142241379310345e-05, "loss": 11.8243, "step": 74900 }, { "epoch": 57.47126436781609, "grad_norm": 1.5265462398529053, "learning_rate": 1.4094348659003831e-05, "loss": 11.982, "step": 75000 }, { "epoch": 57.547892720306514, "grad_norm": 1.5820256471633911, "learning_rate": 1.404645593869732e-05, "loss": 12.0055, "step": 75100 }, { "epoch": 57.62452107279694, "grad_norm": 1.2654030323028564, "learning_rate": 1.3998563218390807e-05, "loss": 11.8634, "step": 75200 }, { "epoch": 57.701149425287355, "grad_norm": 2.1730732917785645, "learning_rate": 1.3950670498084292e-05, "loss": 12.098, "step": 75300 }, { "epoch": 57.77777777777778, "grad_norm": 1.7732394933700562, "learning_rate": 1.3902777777777779e-05, "loss": 11.856, "step": 75400 }, { "epoch": 57.8544061302682, "grad_norm": 1.366039514541626, "learning_rate": 1.3854885057471264e-05, "loss": 12.0139, "step": 75500 }, { "epoch": 57.93103448275862, "grad_norm": 2.9070754051208496, "learning_rate": 1.3806992337164751e-05, "loss": 11.9716, "step": 75600 }, { "epoch": 58.0, "eval_loss": 12.731040000915527, "eval_runtime": 44.0877, "eval_samples_per_second": 29.6, "eval_steps_per_second": 3.72, "step": 75690 }, { "epoch": 58.00766283524904, "grad_norm": 2.1817991733551025, "learning_rate": 1.3759099616858236e-05, "loss": 11.906, "step": 75700 }, { "epoch": 58.08429118773947, "grad_norm": 1.2766177654266357, "learning_rate": 1.3711206896551726e-05, "loss": 12.0479, "step": 75800 }, { "epoch": 58.160919540229884, "grad_norm": 2.82973575592041, "learning_rate": 1.3663314176245212e-05, "loss": 11.947, "step": 75900 }, { "epoch": 58.23754789272031, "grad_norm": 1.2385036945343018, "learning_rate": 1.3615421455938699e-05, "loss": 11.9196, "step": 76000 }, { "epoch": 58.31417624521073, "grad_norm": 1.3823829889297485, "learning_rate": 1.3567528735632184e-05, "loss": 11.9057, "step": 76100 }, { "epoch": 58.39080459770115, "grad_norm": 1.472506046295166, "learning_rate": 1.351963601532567e-05, "loss": 11.9563, "step": 76200 }, { "epoch": 58.46743295019157, "grad_norm": 1.5811665058135986, "learning_rate": 1.3472222222222222e-05, "loss": 11.8257, "step": 76300 }, { "epoch": 58.54406130268199, "grad_norm": 1.5588597059249878, "learning_rate": 1.3424329501915708e-05, "loss": 11.8564, "step": 76400 }, { "epoch": 58.62068965517241, "grad_norm": 1.5810322761535645, "learning_rate": 1.3376436781609198e-05, "loss": 11.8566, "step": 76500 }, { "epoch": 58.69731800766284, "grad_norm": 1.5648218393325806, "learning_rate": 1.3328544061302683e-05, "loss": 11.9988, "step": 76600 }, { "epoch": 58.77394636015325, "grad_norm": 1.8077315092086792, "learning_rate": 1.328065134099617e-05, "loss": 11.7739, "step": 76700 }, { "epoch": 58.85057471264368, "grad_norm": 1.1517853736877441, "learning_rate": 1.3232758620689655e-05, "loss": 11.9046, "step": 76800 }, { "epoch": 58.9272030651341, "grad_norm": 1.4639145135879517, "learning_rate": 1.3184865900383142e-05, "loss": 11.99, "step": 76900 }, { "epoch": 59.0, "eval_loss": 12.737883567810059, "eval_runtime": 44.0757, "eval_samples_per_second": 29.608, "eval_steps_per_second": 3.721, "step": 76995 }, { "epoch": 59.00383141762452, "grad_norm": 0.9936187267303467, "learning_rate": 1.3136973180076629e-05, "loss": 11.9348, "step": 77000 }, { "epoch": 59.08045977011494, "grad_norm": 1.227501630783081, "learning_rate": 1.3089080459770114e-05, "loss": 11.9054, "step": 77100 }, { "epoch": 59.157088122605366, "grad_norm": 1.1214205026626587, "learning_rate": 1.3041187739463603e-05, "loss": 11.7912, "step": 77200 }, { "epoch": 59.23371647509578, "grad_norm": 1.3010284900665283, "learning_rate": 1.299329501915709e-05, "loss": 11.8542, "step": 77300 }, { "epoch": 59.310344827586206, "grad_norm": 1.291937232017517, "learning_rate": 1.2945402298850576e-05, "loss": 11.8613, "step": 77400 }, { "epoch": 59.38697318007663, "grad_norm": 1.224834680557251, "learning_rate": 1.2897509578544062e-05, "loss": 11.905, "step": 77500 }, { "epoch": 59.46360153256705, "grad_norm": 1.308899998664856, "learning_rate": 1.2849616858237548e-05, "loss": 11.9067, "step": 77600 }, { "epoch": 59.54022988505747, "grad_norm": 1.4333239793777466, "learning_rate": 1.2801724137931034e-05, "loss": 11.8825, "step": 77700 }, { "epoch": 59.616858237547895, "grad_norm": 1.0542117357254028, "learning_rate": 1.275383141762452e-05, "loss": 12.1948, "step": 77800 }, { "epoch": 59.69348659003831, "grad_norm": 1.9502829313278198, "learning_rate": 1.2705938697318009e-05, "loss": 11.9644, "step": 77900 }, { "epoch": 59.770114942528735, "grad_norm": 1.3281497955322266, "learning_rate": 1.2658045977011496e-05, "loss": 11.8953, "step": 78000 }, { "epoch": 59.84674329501916, "grad_norm": 1.2546237707138062, "learning_rate": 1.2610153256704981e-05, "loss": 11.8375, "step": 78100 }, { "epoch": 59.923371647509576, "grad_norm": 1.1630369424819946, "learning_rate": 1.2562260536398468e-05, "loss": 11.7133, "step": 78200 }, { "epoch": 60.0, "grad_norm": 1.7483701705932617, "learning_rate": 1.2514367816091955e-05, "loss": 12.2012, "step": 78300 }, { "epoch": 60.0, "eval_loss": 12.731696128845215, "eval_runtime": 44.1463, "eval_samples_per_second": 29.561, "eval_steps_per_second": 3.715, "step": 78300 }, { "epoch": 60.076628352490424, "grad_norm": 2.260547399520874, "learning_rate": 1.2466954022988505e-05, "loss": 11.9756, "step": 78400 }, { "epoch": 60.15325670498084, "grad_norm": 1.387416124343872, "learning_rate": 1.2419061302681993e-05, "loss": 11.9715, "step": 78500 }, { "epoch": 60.229885057471265, "grad_norm": 4.537426948547363, "learning_rate": 1.2371168582375479e-05, "loss": 11.6355, "step": 78600 }, { "epoch": 60.30651340996169, "grad_norm": 1.930817723274231, "learning_rate": 1.2323275862068966e-05, "loss": 11.6992, "step": 78700 }, { "epoch": 60.383141762452105, "grad_norm": 1.7206836938858032, "learning_rate": 1.2275383141762452e-05, "loss": 11.8606, "step": 78800 }, { "epoch": 60.45977011494253, "grad_norm": 1.7796626091003418, "learning_rate": 1.222749042145594e-05, "loss": 11.8648, "step": 78900 }, { "epoch": 60.53639846743295, "grad_norm": 1.6132935285568237, "learning_rate": 1.2179597701149426e-05, "loss": 11.7958, "step": 79000 }, { "epoch": 60.61302681992337, "grad_norm": 1.2063769102096558, "learning_rate": 1.2131704980842913e-05, "loss": 11.8877, "step": 79100 }, { "epoch": 60.689655172413794, "grad_norm": 1.6793837547302246, "learning_rate": 1.20838122605364e-05, "loss": 11.9401, "step": 79200 }, { "epoch": 60.76628352490422, "grad_norm": 2.0831589698791504, "learning_rate": 1.2035919540229885e-05, "loss": 11.832, "step": 79300 }, { "epoch": 60.842911877394634, "grad_norm": 1.4812095165252686, "learning_rate": 1.1988026819923372e-05, "loss": 12.0039, "step": 79400 }, { "epoch": 60.91954022988506, "grad_norm": 2.111269474029541, "learning_rate": 1.1940134099616859e-05, "loss": 12.0629, "step": 79500 }, { "epoch": 60.99616858237548, "grad_norm": 1.0717095136642456, "learning_rate": 1.1892241379310346e-05, "loss": 11.7839, "step": 79600 }, { "epoch": 61.0, "eval_loss": 12.74968433380127, "eval_runtime": 44.1528, "eval_samples_per_second": 29.556, "eval_steps_per_second": 3.714, "step": 79605 }, { "epoch": 61.0727969348659, "grad_norm": 2.625854969024658, "learning_rate": 1.1844348659003831e-05, "loss": 11.9218, "step": 79700 }, { "epoch": 61.14942528735632, "grad_norm": 1.9146480560302734, "learning_rate": 1.179645593869732e-05, "loss": 11.6761, "step": 79800 }, { "epoch": 61.22605363984675, "grad_norm": 0.9696165919303894, "learning_rate": 1.1748563218390805e-05, "loss": 11.9288, "step": 79900 }, { "epoch": 61.30268199233716, "grad_norm": 1.1847577095031738, "learning_rate": 1.1700670498084292e-05, "loss": 11.9674, "step": 80000 }, { "epoch": 61.37931034482759, "grad_norm": 1.3804477453231812, "learning_rate": 1.1652777777777778e-05, "loss": 11.812, "step": 80100 }, { "epoch": 61.45593869731801, "grad_norm": 1.6096410751342773, "learning_rate": 1.1604885057471265e-05, "loss": 11.8585, "step": 80200 }, { "epoch": 61.53256704980843, "grad_norm": 1.8098353147506714, "learning_rate": 1.1556992337164752e-05, "loss": 11.8667, "step": 80300 }, { "epoch": 61.60919540229885, "grad_norm": 6.6866068840026855, "learning_rate": 1.1509099616858237e-05, "loss": 11.8999, "step": 80400 }, { "epoch": 61.68582375478927, "grad_norm": 2.7860629558563232, "learning_rate": 1.1461206896551726e-05, "loss": 11.8976, "step": 80500 }, { "epoch": 61.76245210727969, "grad_norm": 1.7936979532241821, "learning_rate": 1.1413314176245211e-05, "loss": 11.913, "step": 80600 }, { "epoch": 61.839080459770116, "grad_norm": 1.7207527160644531, "learning_rate": 1.1365421455938698e-05, "loss": 12.0002, "step": 80700 }, { "epoch": 61.91570881226053, "grad_norm": 2.8500571250915527, "learning_rate": 1.1317528735632183e-05, "loss": 12.0012, "step": 80800 }, { "epoch": 61.99233716475096, "grad_norm": 2.1529831886291504, "learning_rate": 1.1269636015325672e-05, "loss": 11.9888, "step": 80900 }, { "epoch": 62.0, "eval_loss": 12.742037773132324, "eval_runtime": 44.1517, "eval_samples_per_second": 29.557, "eval_steps_per_second": 3.714, "step": 80910 }, { "epoch": 62.06896551724138, "grad_norm": 1.1954108476638794, "learning_rate": 1.1221743295019157e-05, "loss": 11.9691, "step": 81000 }, { "epoch": 62.1455938697318, "grad_norm": 1.253891944885254, "learning_rate": 1.1174329501915709e-05, "loss": 12.0618, "step": 81100 }, { "epoch": 62.22222222222222, "grad_norm": 1.5132429599761963, "learning_rate": 1.1126436781609197e-05, "loss": 11.9311, "step": 81200 }, { "epoch": 62.298850574712645, "grad_norm": 1.215069055557251, "learning_rate": 1.1078544061302683e-05, "loss": 11.7015, "step": 81300 }, { "epoch": 62.37547892720306, "grad_norm": 2.0881459712982178, "learning_rate": 1.103065134099617e-05, "loss": 12.0909, "step": 81400 }, { "epoch": 62.452107279693486, "grad_norm": 1.079714298248291, "learning_rate": 1.0982758620689655e-05, "loss": 11.9608, "step": 81500 }, { "epoch": 62.52873563218391, "grad_norm": 1.3947062492370605, "learning_rate": 1.0934865900383143e-05, "loss": 11.8452, "step": 81600 }, { "epoch": 62.60536398467433, "grad_norm": 1.0822895765304565, "learning_rate": 1.0886973180076628e-05, "loss": 11.8232, "step": 81700 }, { "epoch": 62.68199233716475, "grad_norm": 1.6000736951828003, "learning_rate": 1.0839080459770115e-05, "loss": 11.994, "step": 81800 }, { "epoch": 62.758620689655174, "grad_norm": 1.6020923852920532, "learning_rate": 1.0791187739463602e-05, "loss": 11.9019, "step": 81900 }, { "epoch": 62.83524904214559, "grad_norm": 1.4164994955062866, "learning_rate": 1.0743295019157089e-05, "loss": 11.8139, "step": 82000 }, { "epoch": 62.911877394636015, "grad_norm": 2.334690570831299, "learning_rate": 1.0695402298850576e-05, "loss": 12.0714, "step": 82100 }, { "epoch": 62.98850574712644, "grad_norm": 1.8338385820388794, "learning_rate": 1.0647509578544061e-05, "loss": 11.8382, "step": 82200 }, { "epoch": 63.0, "eval_loss": 12.733258247375488, "eval_runtime": 44.1527, "eval_samples_per_second": 29.557, "eval_steps_per_second": 3.714, "step": 82215 }, { "epoch": 63.065134099616856, "grad_norm": 3.91227650642395, "learning_rate": 1.059961685823755e-05, "loss": 11.9929, "step": 82300 }, { "epoch": 63.14176245210728, "grad_norm": 1.1621551513671875, "learning_rate": 1.0551724137931035e-05, "loss": 11.9456, "step": 82400 }, { "epoch": 63.2183908045977, "grad_norm": 1.4154562950134277, "learning_rate": 1.0503831417624522e-05, "loss": 12.0645, "step": 82500 }, { "epoch": 63.29501915708812, "grad_norm": 1.8987462520599365, "learning_rate": 1.0455938697318009e-05, "loss": 11.873, "step": 82600 }, { "epoch": 63.371647509578544, "grad_norm": 1.8300188779830933, "learning_rate": 1.0408045977011495e-05, "loss": 11.7687, "step": 82700 }, { "epoch": 63.44827586206897, "grad_norm": 1.4220359325408936, "learning_rate": 1.036015325670498e-05, "loss": 11.8298, "step": 82800 }, { "epoch": 63.524904214559385, "grad_norm": 1.1422735452651978, "learning_rate": 1.0312260536398468e-05, "loss": 11.9857, "step": 82900 }, { "epoch": 63.60153256704981, "grad_norm": 1.6723980903625488, "learning_rate": 1.0264367816091954e-05, "loss": 11.6692, "step": 83000 }, { "epoch": 63.67816091954023, "grad_norm": 1.3438162803649902, "learning_rate": 1.0216954022988506e-05, "loss": 11.8703, "step": 83100 }, { "epoch": 63.75478927203065, "grad_norm": 1.2540138959884644, "learning_rate": 1.0169061302681993e-05, "loss": 11.8198, "step": 83200 }, { "epoch": 63.83141762452107, "grad_norm": 1.439274787902832, "learning_rate": 1.012116858237548e-05, "loss": 11.8904, "step": 83300 }, { "epoch": 63.9080459770115, "grad_norm": 1.0765241384506226, "learning_rate": 1.0073275862068967e-05, "loss": 11.8521, "step": 83400 }, { "epoch": 63.984674329501914, "grad_norm": 1.066419005393982, "learning_rate": 1.0025383141762452e-05, "loss": 11.8361, "step": 83500 }, { "epoch": 64.0, "eval_loss": 12.740053176879883, "eval_runtime": 44.1473, "eval_samples_per_second": 29.56, "eval_steps_per_second": 3.715, "step": 83520 }, { "epoch": 64.06130268199233, "grad_norm": 1.2648850679397583, "learning_rate": 9.977490421455939e-06, "loss": 12.1, "step": 83600 }, { "epoch": 64.13793103448276, "grad_norm": 1.115157961845398, "learning_rate": 9.929597701149426e-06, "loss": 11.798, "step": 83700 }, { "epoch": 64.21455938697318, "grad_norm": 1.6352553367614746, "learning_rate": 9.881704980842913e-06, "loss": 11.9761, "step": 83800 }, { "epoch": 64.2911877394636, "grad_norm": 1.2003965377807617, "learning_rate": 9.833812260536398e-06, "loss": 11.9813, "step": 83900 }, { "epoch": 64.36781609195403, "grad_norm": 1.5004589557647705, "learning_rate": 9.785919540229886e-06, "loss": 11.7826, "step": 84000 }, { "epoch": 64.44444444444444, "grad_norm": 1.3350985050201416, "learning_rate": 9.738026819923372e-06, "loss": 11.8015, "step": 84100 }, { "epoch": 64.52107279693486, "grad_norm": 1.5985853672027588, "learning_rate": 9.690134099616858e-06, "loss": 11.6736, "step": 84200 }, { "epoch": 64.59770114942529, "grad_norm": 2.1115546226501465, "learning_rate": 9.642241379310345e-06, "loss": 11.7572, "step": 84300 }, { "epoch": 64.67432950191571, "grad_norm": 2.5769665241241455, "learning_rate": 9.594348659003832e-06, "loss": 11.8057, "step": 84400 }, { "epoch": 64.75095785440612, "grad_norm": 3.2280073165893555, "learning_rate": 9.546455938697319e-06, "loss": 11.9184, "step": 84500 }, { "epoch": 64.82758620689656, "grad_norm": 1.2311729192733765, "learning_rate": 9.498563218390804e-06, "loss": 11.9657, "step": 84600 }, { "epoch": 64.90421455938697, "grad_norm": 1.6303430795669556, "learning_rate": 9.450670498084293e-06, "loss": 11.9864, "step": 84700 }, { "epoch": 64.98084291187739, "grad_norm": 1.6421687602996826, "learning_rate": 9.402777777777778e-06, "loss": 11.8224, "step": 84800 }, { "epoch": 65.0, "eval_loss": 12.752345085144043, "eval_runtime": 44.1763, "eval_samples_per_second": 29.541, "eval_steps_per_second": 3.712, "step": 84825 }, { "epoch": 65.05747126436782, "grad_norm": 1.2040326595306396, "learning_rate": 9.354885057471265e-06, "loss": 11.7626, "step": 84900 }, { "epoch": 65.13409961685824, "grad_norm": 1.1865389347076416, "learning_rate": 9.30699233716475e-06, "loss": 12.015, "step": 85000 }, { "epoch": 65.21072796934865, "grad_norm": 2.0402724742889404, "learning_rate": 9.259099616858239e-06, "loss": 11.8473, "step": 85100 }, { "epoch": 65.28735632183908, "grad_norm": 1.8505759239196777, "learning_rate": 9.21168582375479e-06, "loss": 11.9353, "step": 85200 }, { "epoch": 65.3639846743295, "grad_norm": 2.3651750087738037, "learning_rate": 9.163793103448276e-06, "loss": 12.0637, "step": 85300 }, { "epoch": 65.44061302681992, "grad_norm": 1.9731732606887817, "learning_rate": 9.115900383141762e-06, "loss": 12.0013, "step": 85400 }, { "epoch": 65.51724137931035, "grad_norm": 1.3928194046020508, "learning_rate": 9.06800766283525e-06, "loss": 11.6937, "step": 85500 }, { "epoch": 65.59386973180077, "grad_norm": 1.580771565437317, "learning_rate": 9.020114942528736e-06, "loss": 11.5997, "step": 85600 }, { "epoch": 65.67049808429118, "grad_norm": 1.143648624420166, "learning_rate": 8.972222222222221e-06, "loss": 11.948, "step": 85700 }, { "epoch": 65.74712643678161, "grad_norm": 1.9105567932128906, "learning_rate": 8.92432950191571e-06, "loss": 11.9796, "step": 85800 }, { "epoch": 65.82375478927203, "grad_norm": 1.3926714658737183, "learning_rate": 8.876436781609195e-06, "loss": 11.7775, "step": 85900 }, { "epoch": 65.90038314176245, "grad_norm": 1.1419901847839355, "learning_rate": 8.828544061302682e-06, "loss": 11.7615, "step": 86000 }, { "epoch": 65.97701149425288, "grad_norm": 1.6939061880111694, "learning_rate": 8.780651340996169e-06, "loss": 11.8244, "step": 86100 }, { "epoch": 66.0, "eval_loss": 12.737361907958984, "eval_runtime": 44.1505, "eval_samples_per_second": 29.558, "eval_steps_per_second": 3.715, "step": 86130 }, { "epoch": 66.0536398467433, "grad_norm": 1.953165054321289, "learning_rate": 8.732758620689656e-06, "loss": 11.9442, "step": 86200 }, { "epoch": 66.13026819923371, "grad_norm": 2.1596179008483887, "learning_rate": 8.684865900383143e-06, "loss": 11.764, "step": 86300 }, { "epoch": 66.20689655172414, "grad_norm": 1.4609719514846802, "learning_rate": 8.636973180076628e-06, "loss": 12.1997, "step": 86400 }, { "epoch": 66.28352490421456, "grad_norm": 2.0631511211395264, "learning_rate": 8.589080459770116e-06, "loss": 11.8684, "step": 86500 }, { "epoch": 66.36015325670498, "grad_norm": 1.4530664682388306, "learning_rate": 8.541187739463602e-06, "loss": 11.8307, "step": 86600 }, { "epoch": 66.4367816091954, "grad_norm": 2.148606777191162, "learning_rate": 8.493295019157089e-06, "loss": 11.9725, "step": 86700 }, { "epoch": 66.51340996168582, "grad_norm": 1.8974863290786743, "learning_rate": 8.445402298850575e-06, "loss": 11.9907, "step": 86800 }, { "epoch": 66.59003831417624, "grad_norm": 2.369657278060913, "learning_rate": 8.397509578544062e-06, "loss": 11.9563, "step": 86900 }, { "epoch": 66.66666666666667, "grad_norm": 1.6854480504989624, "learning_rate": 8.349616858237547e-06, "loss": 11.9173, "step": 87000 }, { "epoch": 66.74329501915709, "grad_norm": 1.6539610624313354, "learning_rate": 8.301724137931034e-06, "loss": 11.9584, "step": 87100 }, { "epoch": 66.8199233716475, "grad_norm": 1.346731424331665, "learning_rate": 8.253831417624521e-06, "loss": 11.7909, "step": 87200 }, { "epoch": 66.89655172413794, "grad_norm": 1.6548290252685547, "learning_rate": 8.206417624521073e-06, "loss": 11.9346, "step": 87300 }, { "epoch": 66.97318007662835, "grad_norm": 1.1189563274383545, "learning_rate": 8.15852490421456e-06, "loss": 11.9832, "step": 87400 }, { "epoch": 67.0, "eval_loss": 12.747148513793945, "eval_runtime": 44.147, "eval_samples_per_second": 29.56, "eval_steps_per_second": 3.715, "step": 87435 }, { "epoch": 67.04980842911877, "grad_norm": 1.7302024364471436, "learning_rate": 8.110632183908045e-06, "loss": 11.8374, "step": 87500 }, { "epoch": 67.1264367816092, "grad_norm": 0.8793215751647949, "learning_rate": 8.062739463601534e-06, "loss": 11.7415, "step": 87600 }, { "epoch": 67.20306513409962, "grad_norm": 1.1903204917907715, "learning_rate": 8.014846743295019e-06, "loss": 11.8223, "step": 87700 }, { "epoch": 67.27969348659003, "grad_norm": 2.025223731994629, "learning_rate": 7.966954022988506e-06, "loss": 11.7065, "step": 87800 }, { "epoch": 67.35632183908046, "grad_norm": 1.2028359174728394, "learning_rate": 7.919061302681993e-06, "loss": 11.9446, "step": 87900 }, { "epoch": 67.43295019157088, "grad_norm": 1.56088387966156, "learning_rate": 7.87116858237548e-06, "loss": 12.0176, "step": 88000 }, { "epoch": 67.5095785440613, "grad_norm": 1.4466462135314941, "learning_rate": 7.823275862068966e-06, "loss": 11.8777, "step": 88100 }, { "epoch": 67.58620689655173, "grad_norm": 2.2348804473876953, "learning_rate": 7.775383141762453e-06, "loss": 11.8506, "step": 88200 }, { "epoch": 67.66283524904215, "grad_norm": 1.0889838933944702, "learning_rate": 7.72749042145594e-06, "loss": 11.9706, "step": 88300 }, { "epoch": 67.73946360153256, "grad_norm": 1.6289935111999512, "learning_rate": 7.679597701149425e-06, "loss": 11.9588, "step": 88400 }, { "epoch": 67.816091954023, "grad_norm": 1.2480045557022095, "learning_rate": 7.631704980842912e-06, "loss": 11.7933, "step": 88500 }, { "epoch": 67.89272030651341, "grad_norm": 1.5679010152816772, "learning_rate": 7.583812260536399e-06, "loss": 12.0495, "step": 88600 }, { "epoch": 67.96934865900383, "grad_norm": 1.2820953130722046, "learning_rate": 7.535919540229885e-06, "loss": 11.8478, "step": 88700 }, { "epoch": 68.0, "eval_loss": 12.716951370239258, "eval_runtime": 44.1526, "eval_samples_per_second": 29.557, "eval_steps_per_second": 3.714, "step": 88740 }, { "epoch": 68.04597701149426, "grad_norm": 1.0503605604171753, "learning_rate": 7.488026819923372e-06, "loss": 11.9092, "step": 88800 }, { "epoch": 68.12260536398468, "grad_norm": 1.5500402450561523, "learning_rate": 7.440134099616859e-06, "loss": 11.933, "step": 88900 }, { "epoch": 68.19923371647509, "grad_norm": 2.4164953231811523, "learning_rate": 7.392241379310346e-06, "loss": 11.8528, "step": 89000 }, { "epoch": 68.27586206896552, "grad_norm": 1.7877123355865479, "learning_rate": 7.344348659003832e-06, "loss": 11.8459, "step": 89100 }, { "epoch": 68.35249042145594, "grad_norm": 1.6601005792617798, "learning_rate": 7.296455938697318e-06, "loss": 11.8986, "step": 89200 }, { "epoch": 68.42911877394636, "grad_norm": 1.6431148052215576, "learning_rate": 7.24904214559387e-06, "loss": 11.8467, "step": 89300 }, { "epoch": 68.50574712643679, "grad_norm": 1.2147421836853027, "learning_rate": 7.201149425287357e-06, "loss": 11.9989, "step": 89400 }, { "epoch": 68.5823754789272, "grad_norm": 1.0646436214447021, "learning_rate": 7.153256704980843e-06, "loss": 11.6439, "step": 89500 }, { "epoch": 68.65900383141762, "grad_norm": 1.494936466217041, "learning_rate": 7.105363984674329e-06, "loss": 11.8232, "step": 89600 }, { "epoch": 68.73563218390805, "grad_norm": 1.1928653717041016, "learning_rate": 7.057471264367817e-06, "loss": 12.032, "step": 89700 }, { "epoch": 68.81226053639847, "grad_norm": 1.2193999290466309, "learning_rate": 7.009578544061303e-06, "loss": 11.8999, "step": 89800 }, { "epoch": 68.88888888888889, "grad_norm": 1.418272852897644, "learning_rate": 6.961685823754789e-06, "loss": 12.0139, "step": 89900 }, { "epoch": 68.96551724137932, "grad_norm": 2.331040620803833, "learning_rate": 6.913793103448277e-06, "loss": 12.0201, "step": 90000 }, { "epoch": 69.0, "eval_loss": 12.731438636779785, "eval_runtime": 44.1419, "eval_samples_per_second": 29.564, "eval_steps_per_second": 3.715, "step": 90045 }, { "epoch": 69.04214559386973, "grad_norm": 1.2469091415405273, "learning_rate": 6.865900383141763e-06, "loss": 11.7182, "step": 90100 }, { "epoch": 69.11877394636015, "grad_norm": 1.299902319908142, "learning_rate": 6.818007662835249e-06, "loss": 11.908, "step": 90200 }, { "epoch": 69.19540229885058, "grad_norm": 2.0446414947509766, "learning_rate": 6.770114942528737e-06, "loss": 11.8736, "step": 90300 }, { "epoch": 69.272030651341, "grad_norm": 2.1058554649353027, "learning_rate": 6.722222222222223e-06, "loss": 11.7726, "step": 90400 }, { "epoch": 69.34865900383141, "grad_norm": 1.222571849822998, "learning_rate": 6.674329501915709e-06, "loss": 12.1008, "step": 90500 }, { "epoch": 69.42528735632185, "grad_norm": 1.2086107730865479, "learning_rate": 6.6264367816091955e-06, "loss": 11.9332, "step": 90600 }, { "epoch": 69.50191570881226, "grad_norm": 1.188658356666565, "learning_rate": 6.578544061302682e-06, "loss": 11.9603, "step": 90700 }, { "epoch": 69.57854406130268, "grad_norm": 1.1233985424041748, "learning_rate": 6.530651340996169e-06, "loss": 11.7879, "step": 90800 }, { "epoch": 69.65517241379311, "grad_norm": 1.8599299192428589, "learning_rate": 6.482758620689655e-06, "loss": 12.0864, "step": 90900 }, { "epoch": 69.73180076628353, "grad_norm": 1.213908076286316, "learning_rate": 6.434865900383143e-06, "loss": 11.7091, "step": 91000 }, { "epoch": 69.80842911877394, "grad_norm": 1.2682372331619263, "learning_rate": 6.386973180076629e-06, "loss": 11.8762, "step": 91100 }, { "epoch": 69.88505747126437, "grad_norm": 1.940184473991394, "learning_rate": 6.339080459770115e-06, "loss": 11.6487, "step": 91200 }, { "epoch": 69.96168582375479, "grad_norm": 1.4338123798370361, "learning_rate": 6.291187739463601e-06, "loss": 12.152, "step": 91300 }, { "epoch": 70.0, "eval_loss": 12.735883712768555, "eval_runtime": 44.179, "eval_samples_per_second": 29.539, "eval_steps_per_second": 3.712, "step": 91350 }, { "epoch": 70.03831417624521, "grad_norm": 2.018376111984253, "learning_rate": 6.243295019157088e-06, "loss": 11.9978, "step": 91400 }, { "epoch": 70.11494252873563, "grad_norm": 1.4965932369232178, "learning_rate": 6.195881226053641e-06, "loss": 11.9588, "step": 91500 }, { "epoch": 70.19157088122606, "grad_norm": 1.5459176301956177, "learning_rate": 6.147988505747127e-06, "loss": 11.7513, "step": 91600 }, { "epoch": 70.26819923371647, "grad_norm": 1.6559784412384033, "learning_rate": 6.1000957854406135e-06, "loss": 11.8124, "step": 91700 }, { "epoch": 70.34482758620689, "grad_norm": 2.100288152694702, "learning_rate": 6.0522030651341e-06, "loss": 11.8001, "step": 91800 }, { "epoch": 70.42145593869732, "grad_norm": 2.0167760848999023, "learning_rate": 6.0043103448275864e-06, "loss": 11.7079, "step": 91900 }, { "epoch": 70.49808429118774, "grad_norm": 1.2484099864959717, "learning_rate": 5.956417624521073e-06, "loss": 11.8747, "step": 92000 }, { "epoch": 70.57471264367815, "grad_norm": 1.4585705995559692, "learning_rate": 5.908524904214559e-06, "loss": 11.6371, "step": 92100 }, { "epoch": 70.65134099616859, "grad_norm": 1.2680083513259888, "learning_rate": 5.860632183908046e-06, "loss": 11.8783, "step": 92200 }, { "epoch": 70.727969348659, "grad_norm": 3.2429590225219727, "learning_rate": 5.812739463601532e-06, "loss": 12.0867, "step": 92300 }, { "epoch": 70.80459770114942, "grad_norm": 1.6496800184249878, "learning_rate": 5.764846743295019e-06, "loss": 11.8665, "step": 92400 }, { "epoch": 70.88122605363985, "grad_norm": 1.7092400789260864, "learning_rate": 5.716954022988506e-06, "loss": 11.8957, "step": 92500 }, { "epoch": 70.95785440613027, "grad_norm": 1.308349370956421, "learning_rate": 5.669061302681993e-06, "loss": 11.6562, "step": 92600 }, { "epoch": 71.0, "eval_loss": 12.738100051879883, "eval_runtime": 44.1855, "eval_samples_per_second": 29.535, "eval_steps_per_second": 3.712, "step": 92655 }, { "epoch": 71.03448275862068, "grad_norm": 1.4456454515457153, "learning_rate": 5.62116858237548e-06, "loss": 11.9577, "step": 92700 }, { "epoch": 71.11111111111111, "grad_norm": 1.178861141204834, "learning_rate": 5.573275862068966e-06, "loss": 11.7769, "step": 92800 }, { "epoch": 71.18773946360153, "grad_norm": 1.2721989154815674, "learning_rate": 5.525383141762453e-06, "loss": 12.0604, "step": 92900 }, { "epoch": 71.26436781609195, "grad_norm": 1.4360485076904297, "learning_rate": 5.4774904214559396e-06, "loss": 11.853, "step": 93000 }, { "epoch": 71.34099616858238, "grad_norm": 1.1324783563613892, "learning_rate": 5.429597701149426e-06, "loss": 12.0389, "step": 93100 }, { "epoch": 71.4176245210728, "grad_norm": 1.327430009841919, "learning_rate": 5.3817049808429125e-06, "loss": 12.1736, "step": 93200 }, { "epoch": 71.49425287356321, "grad_norm": 1.7536532878875732, "learning_rate": 5.3338122605363985e-06, "loss": 11.8394, "step": 93300 }, { "epoch": 71.57088122605364, "grad_norm": 1.2314512729644775, "learning_rate": 5.285919540229885e-06, "loss": 11.8958, "step": 93400 }, { "epoch": 71.64750957854406, "grad_norm": 1.3814700841903687, "learning_rate": 5.2380268199233714e-06, "loss": 11.8036, "step": 93500 }, { "epoch": 71.72413793103448, "grad_norm": 1.6986061334609985, "learning_rate": 5.190134099616858e-06, "loss": 11.7598, "step": 93600 }, { "epoch": 71.80076628352491, "grad_norm": 1.1988410949707031, "learning_rate": 5.142241379310345e-06, "loss": 11.7643, "step": 93700 }, { "epoch": 71.87739463601532, "grad_norm": 1.005979061126709, "learning_rate": 5.094827586206897e-06, "loss": 11.8694, "step": 93800 }, { "epoch": 71.95402298850574, "grad_norm": 1.8171489238739014, "learning_rate": 5.046934865900384e-06, "loss": 11.7541, "step": 93900 }, { "epoch": 72.0, "eval_loss": 12.730957984924316, "eval_runtime": 44.1811, "eval_samples_per_second": 29.538, "eval_steps_per_second": 3.712, "step": 93960 }, { "epoch": 72.03065134099617, "grad_norm": 1.2113227844238281, "learning_rate": 4.99904214559387e-06, "loss": 11.8434, "step": 94000 }, { "epoch": 72.10727969348659, "grad_norm": 1.9516360759735107, "learning_rate": 4.951149425287357e-06, "loss": 12.0732, "step": 94100 }, { "epoch": 72.183908045977, "grad_norm": 1.6725817918777466, "learning_rate": 4.903256704980843e-06, "loss": 11.9187, "step": 94200 }, { "epoch": 72.26053639846744, "grad_norm": 1.5325151681900024, "learning_rate": 4.85536398467433e-06, "loss": 11.8286, "step": 94300 }, { "epoch": 72.33716475095785, "grad_norm": 1.4346359968185425, "learning_rate": 4.807471264367816e-06, "loss": 11.9449, "step": 94400 }, { "epoch": 72.41379310344827, "grad_norm": 1.8294119834899902, "learning_rate": 4.7595785440613025e-06, "loss": 11.7885, "step": 94500 }, { "epoch": 72.4904214559387, "grad_norm": 3.0054831504821777, "learning_rate": 4.7116858237547894e-06, "loss": 11.9011, "step": 94600 }, { "epoch": 72.56704980842912, "grad_norm": 3.023944616317749, "learning_rate": 4.663793103448276e-06, "loss": 11.7951, "step": 94700 }, { "epoch": 72.64367816091954, "grad_norm": 1.6727356910705566, "learning_rate": 4.615900383141763e-06, "loss": 11.6363, "step": 94800 }, { "epoch": 72.72030651340997, "grad_norm": 2.4141032695770264, "learning_rate": 4.568007662835249e-06, "loss": 11.8062, "step": 94900 }, { "epoch": 72.79693486590038, "grad_norm": 1.810632348060608, "learning_rate": 4.520114942528736e-06, "loss": 11.7885, "step": 95000 }, { "epoch": 72.8735632183908, "grad_norm": 1.2663646936416626, "learning_rate": 4.472222222222222e-06, "loss": 11.8532, "step": 95100 }, { "epoch": 72.95019157088123, "grad_norm": 1.1440293788909912, "learning_rate": 4.424329501915709e-06, "loss": 11.9398, "step": 95200 }, { "epoch": 73.0, "eval_loss": 12.724422454833984, "eval_runtime": 44.1981, "eval_samples_per_second": 29.526, "eval_steps_per_second": 3.711, "step": 95265 }, { "epoch": 73.02681992337165, "grad_norm": 1.0655268430709839, "learning_rate": 4.376436781609196e-06, "loss": 11.9855, "step": 95300 }, { "epoch": 73.10344827586206, "grad_norm": 1.2701817750930786, "learning_rate": 4.328544061302682e-06, "loss": 11.7504, "step": 95400 }, { "epoch": 73.1800766283525, "grad_norm": 1.4740400314331055, "learning_rate": 4.280651340996169e-06, "loss": 11.8391, "step": 95500 }, { "epoch": 73.25670498084291, "grad_norm": 2.1387853622436523, "learning_rate": 4.232758620689655e-06, "loss": 11.8052, "step": 95600 }, { "epoch": 73.33333333333333, "grad_norm": 1.295242190361023, "learning_rate": 4.184865900383142e-06, "loss": 11.9859, "step": 95700 }, { "epoch": 73.40996168582376, "grad_norm": 1.4711384773254395, "learning_rate": 4.136973180076629e-06, "loss": 12.1523, "step": 95800 }, { "epoch": 73.48659003831418, "grad_norm": 1.7779674530029297, "learning_rate": 4.089080459770115e-06, "loss": 11.698, "step": 95900 }, { "epoch": 73.5632183908046, "grad_norm": 2.6070003509521484, "learning_rate": 4.0411877394636015e-06, "loss": 11.9877, "step": 96000 }, { "epoch": 73.63984674329502, "grad_norm": 1.4775136709213257, "learning_rate": 3.993295019157088e-06, "loss": 11.7928, "step": 96100 }, { "epoch": 73.71647509578544, "grad_norm": 1.7105778455734253, "learning_rate": 3.945402298850575e-06, "loss": 12.0444, "step": 96200 }, { "epoch": 73.79310344827586, "grad_norm": 1.6719238758087158, "learning_rate": 3.897988505747126e-06, "loss": 11.9407, "step": 96300 }, { "epoch": 73.86973180076629, "grad_norm": 1.312474250793457, "learning_rate": 3.850095785440613e-06, "loss": 11.7468, "step": 96400 }, { "epoch": 73.9463601532567, "grad_norm": 0.9431168437004089, "learning_rate": 3.8022030651340995e-06, "loss": 11.8737, "step": 96500 }, { "epoch": 74.0, "eval_loss": 12.720576286315918, "eval_runtime": 44.1482, "eval_samples_per_second": 29.56, "eval_steps_per_second": 3.715, "step": 96570 }, { "epoch": 74.02298850574712, "grad_norm": 1.6064398288726807, "learning_rate": 3.7543103448275864e-06, "loss": 11.8828, "step": 96600 }, { "epoch": 74.09961685823755, "grad_norm": 2.088803768157959, "learning_rate": 3.7064176245210733e-06, "loss": 11.7576, "step": 96700 }, { "epoch": 74.17624521072797, "grad_norm": 1.5417454242706299, "learning_rate": 3.6585249042145593e-06, "loss": 11.9239, "step": 96800 }, { "epoch": 74.25287356321839, "grad_norm": 1.5983319282531738, "learning_rate": 3.610632183908046e-06, "loss": 11.8119, "step": 96900 }, { "epoch": 74.32950191570882, "grad_norm": 3.7642099857330322, "learning_rate": 3.5627394636015326e-06, "loss": 11.8259, "step": 97000 }, { "epoch": 74.40613026819923, "grad_norm": 1.5149072408676147, "learning_rate": 3.5148467432950195e-06, "loss": 11.9898, "step": 97100 }, { "epoch": 74.48275862068965, "grad_norm": 0.9915036559104919, "learning_rate": 3.4669540229885055e-06, "loss": 11.7665, "step": 97200 }, { "epoch": 74.55938697318008, "grad_norm": 1.2745176553726196, "learning_rate": 3.4190613026819924e-06, "loss": 11.9657, "step": 97300 }, { "epoch": 74.6360153256705, "grad_norm": 2.390751600265503, "learning_rate": 3.3711685823754793e-06, "loss": 11.6856, "step": 97400 }, { "epoch": 74.71264367816092, "grad_norm": 2.2279295921325684, "learning_rate": 3.3232758620689653e-06, "loss": 11.7551, "step": 97500 }, { "epoch": 74.78927203065135, "grad_norm": 1.8389006853103638, "learning_rate": 3.275383141762452e-06, "loss": 12.0037, "step": 97600 }, { "epoch": 74.86590038314176, "grad_norm": 1.4288936853408813, "learning_rate": 3.2274904214559387e-06, "loss": 12.0561, "step": 97700 }, { "epoch": 74.94252873563218, "grad_norm": 1.037800669670105, "learning_rate": 3.1795977011494255e-06, "loss": 11.9257, "step": 97800 }, { "epoch": 75.0, "eval_loss": 12.724896430969238, "eval_runtime": 44.1538, "eval_samples_per_second": 29.556, "eval_steps_per_second": 3.714, "step": 97875 }, { "epoch": 75.01915708812261, "grad_norm": 0.9783554673194885, "learning_rate": 3.1317049808429124e-06, "loss": 11.7455, "step": 97900 }, { "epoch": 75.09578544061303, "grad_norm": 1.4434301853179932, "learning_rate": 3.0838122605363985e-06, "loss": 11.99, "step": 98000 }, { "epoch": 75.17241379310344, "grad_norm": 1.2560200691223145, "learning_rate": 3.035919540229885e-06, "loss": 11.8445, "step": 98100 }, { "epoch": 75.24904214559388, "grad_norm": 1.123687982559204, "learning_rate": 2.988026819923372e-06, "loss": 11.8894, "step": 98200 }, { "epoch": 75.32567049808429, "grad_norm": 1.2393250465393066, "learning_rate": 2.9401340996168583e-06, "loss": 11.7591, "step": 98300 }, { "epoch": 75.40229885057471, "grad_norm": 2.023070812225342, "learning_rate": 2.892241379310345e-06, "loss": 11.7083, "step": 98400 }, { "epoch": 75.47892720306514, "grad_norm": 1.7746585607528687, "learning_rate": 2.8443486590038316e-06, "loss": 12.0237, "step": 98500 }, { "epoch": 75.55555555555556, "grad_norm": 1.6215800046920776, "learning_rate": 2.796455938697318e-06, "loss": 11.8271, "step": 98600 }, { "epoch": 75.63218390804597, "grad_norm": 2.3727614879608154, "learning_rate": 2.7490421455938698e-06, "loss": 11.9133, "step": 98700 }, { "epoch": 75.7088122605364, "grad_norm": 1.562569260597229, "learning_rate": 2.7011494252873562e-06, "loss": 11.8886, "step": 98800 }, { "epoch": 75.78544061302682, "grad_norm": 0.8996521830558777, "learning_rate": 2.653256704980843e-06, "loss": 11.6606, "step": 98900 }, { "epoch": 75.86206896551724, "grad_norm": 1.6331411600112915, "learning_rate": 2.6053639846743296e-06, "loss": 12.057, "step": 99000 }, { "epoch": 75.93869731800767, "grad_norm": 1.2690104246139526, "learning_rate": 2.5574712643678165e-06, "loss": 11.9791, "step": 99100 }, { "epoch": 76.0, "eval_loss": 12.717323303222656, "eval_runtime": 44.1546, "eval_samples_per_second": 29.555, "eval_steps_per_second": 3.714, "step": 99180 }, { "epoch": 76.01532567049809, "grad_norm": 1.737823724746704, "learning_rate": 2.509578544061303e-06, "loss": 11.8981, "step": 99200 }, { "epoch": 76.0919540229885, "grad_norm": 1.0878353118896484, "learning_rate": 2.4616858237547894e-06, "loss": 11.8443, "step": 99300 }, { "epoch": 76.16858237547893, "grad_norm": 2.0454564094543457, "learning_rate": 2.413793103448276e-06, "loss": 11.8515, "step": 99400 }, { "epoch": 76.24521072796935, "grad_norm": 1.3210684061050415, "learning_rate": 2.3659003831417623e-06, "loss": 12.0233, "step": 99500 }, { "epoch": 76.32183908045977, "grad_norm": 1.1547104120254517, "learning_rate": 2.318007662835249e-06, "loss": 11.7145, "step": 99600 }, { "epoch": 76.3984674329502, "grad_norm": 1.3948626518249512, "learning_rate": 2.270114942528736e-06, "loss": 11.7098, "step": 99700 }, { "epoch": 76.47509578544062, "grad_norm": 1.2874501943588257, "learning_rate": 2.2222222222222225e-06, "loss": 11.8953, "step": 99800 }, { "epoch": 76.55172413793103, "grad_norm": 1.8570905923843384, "learning_rate": 2.174329501915709e-06, "loss": 11.9397, "step": 99900 }, { "epoch": 76.62835249042146, "grad_norm": 1.3673057556152344, "learning_rate": 2.1264367816091954e-06, "loss": 11.8056, "step": 100000 }, { "epoch": 76.70498084291188, "grad_norm": 2.1938419342041016, "learning_rate": 2.078544061302682e-06, "loss": 11.9414, "step": 100100 }, { "epoch": 76.7816091954023, "grad_norm": 1.9171061515808105, "learning_rate": 2.0306513409961687e-06, "loss": 11.8369, "step": 100200 }, { "epoch": 76.85823754789271, "grad_norm": 1.0486401319503784, "learning_rate": 1.982758620689655e-06, "loss": 11.8322, "step": 100300 }, { "epoch": 76.93486590038314, "grad_norm": 1.6005215644836426, "learning_rate": 1.934865900383142e-06, "loss": 11.8781, "step": 100400 }, { "epoch": 77.0, "eval_loss": 12.72097396850586, "eval_runtime": 44.1751, "eval_samples_per_second": 29.542, "eval_steps_per_second": 3.712, "step": 100485 } ], "logging_steps": 100, "max_steps": 104400, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 9 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.681650983960218e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }