|
{ |
|
"best_metric": 12.716951370239258, |
|
"best_model_checkpoint": "/kaggle/working/output/checkpoint-88740", |
|
"epoch": 77.0, |
|
"eval_steps": 500, |
|
"global_step": 100485, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07662835249042145, |
|
"grad_norm": 8.174947738647461, |
|
"learning_rate": 4.9952586206896554e-05, |
|
"loss": 96.5258, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1532567049808429, |
|
"grad_norm": 8.584559440612793, |
|
"learning_rate": 4.990469348659004e-05, |
|
"loss": 48.0822, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.22988505747126436, |
|
"grad_norm": 8.02587604522705, |
|
"learning_rate": 4.985680076628353e-05, |
|
"loss": 31.9469, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3065134099616858, |
|
"grad_norm": 6.968703746795654, |
|
"learning_rate": 4.9808908045977015e-05, |
|
"loss": 24.973, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3831417624521073, |
|
"grad_norm": 6.017839431762695, |
|
"learning_rate": 4.97610153256705e-05, |
|
"loss": 20.7473, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.45977011494252873, |
|
"grad_norm": 4.75618839263916, |
|
"learning_rate": 4.971312260536399e-05, |
|
"loss": 18.6219, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5363984674329502, |
|
"grad_norm": 3.5624868869781494, |
|
"learning_rate": 4.9665229885057475e-05, |
|
"loss": 17.1775, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6130268199233716, |
|
"grad_norm": 2.889848470687866, |
|
"learning_rate": 4.961733716475096e-05, |
|
"loss": 16.1131, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 8.15518856048584, |
|
"learning_rate": 4.956944444444445e-05, |
|
"loss": 15.8697, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7662835249042146, |
|
"grad_norm": 3.092848539352417, |
|
"learning_rate": 4.952155172413793e-05, |
|
"loss": 15.5523, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.842911877394636, |
|
"grad_norm": 2.181015968322754, |
|
"learning_rate": 4.9473659003831416e-05, |
|
"loss": 15.5628, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9195402298850575, |
|
"grad_norm": 2.1515514850616455, |
|
"learning_rate": 4.94257662835249e-05, |
|
"loss": 15.3004, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.9961685823754789, |
|
"grad_norm": 1.476803183555603, |
|
"learning_rate": 4.937787356321839e-05, |
|
"loss": 15.3448, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 15.641121864318848, |
|
"eval_runtime": 44.0061, |
|
"eval_samples_per_second": 29.655, |
|
"eval_steps_per_second": 3.727, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.0727969348659003, |
|
"grad_norm": 3.050917863845825, |
|
"learning_rate": 4.932998084291188e-05, |
|
"loss": 14.901, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.1494252873563218, |
|
"grad_norm": 1.6784011125564575, |
|
"learning_rate": 4.928208812260537e-05, |
|
"loss": 14.7073, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.2260536398467432, |
|
"grad_norm": 3.2630977630615234, |
|
"learning_rate": 4.923419540229886e-05, |
|
"loss": 14.9142, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.3026819923371646, |
|
"grad_norm": 1.6106696128845215, |
|
"learning_rate": 4.9186302681992344e-05, |
|
"loss": 14.9731, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 4.378266334533691, |
|
"learning_rate": 4.9138409961685824e-05, |
|
"loss": 14.5922, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.4559386973180077, |
|
"grad_norm": 2.196368455886841, |
|
"learning_rate": 4.909051724137931e-05, |
|
"loss": 15.024, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.5325670498084292, |
|
"grad_norm": 1.1820286512374878, |
|
"learning_rate": 4.90426245210728e-05, |
|
"loss": 14.6291, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.6091954022988506, |
|
"grad_norm": 2.6733219623565674, |
|
"learning_rate": 4.8994731800766285e-05, |
|
"loss": 15.1916, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.685823754789272, |
|
"grad_norm": 2.461630344390869, |
|
"learning_rate": 4.894683908045977e-05, |
|
"loss": 14.7438, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.7624521072796935, |
|
"grad_norm": 1.7039703130722046, |
|
"learning_rate": 4.889894636015326e-05, |
|
"loss": 14.3014, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.839080459770115, |
|
"grad_norm": 2.291198253631592, |
|
"learning_rate": 4.8851053639846746e-05, |
|
"loss": 14.5648, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.9157088122605364, |
|
"grad_norm": 2.088695764541626, |
|
"learning_rate": 4.880316091954023e-05, |
|
"loss": 14.2778, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.9923371647509578, |
|
"grad_norm": 1.9745572805404663, |
|
"learning_rate": 4.875526819923372e-05, |
|
"loss": 14.612, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 14.947260856628418, |
|
"eval_runtime": 44.059, |
|
"eval_samples_per_second": 29.619, |
|
"eval_steps_per_second": 3.722, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 3.296757698059082, |
|
"learning_rate": 4.8707375478927206e-05, |
|
"loss": 14.4268, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.1455938697318007, |
|
"grad_norm": 1.2265104055404663, |
|
"learning_rate": 4.865948275862069e-05, |
|
"loss": 14.3716, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 2.3575916290283203, |
|
"learning_rate": 4.861159003831418e-05, |
|
"loss": 14.2911, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.2988505747126435, |
|
"grad_norm": 1.535346508026123, |
|
"learning_rate": 4.856369731800767e-05, |
|
"loss": 14.0469, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.375478927203065, |
|
"grad_norm": 2.3857269287109375, |
|
"learning_rate": 4.8515804597701154e-05, |
|
"loss": 14.0246, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.4521072796934864, |
|
"grad_norm": 1.46570885181427, |
|
"learning_rate": 4.846791187739464e-05, |
|
"loss": 14.0864, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.528735632183908, |
|
"grad_norm": 1.3398170471191406, |
|
"learning_rate": 4.842001915708813e-05, |
|
"loss": 14.1075, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.6053639846743293, |
|
"grad_norm": 1.4247232675552368, |
|
"learning_rate": 4.8372126436781614e-05, |
|
"loss": 13.9681, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.681992337164751, |
|
"grad_norm": 1.602295160293579, |
|
"learning_rate": 4.83242337164751e-05, |
|
"loss": 14.0847, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 1.8135626316070557, |
|
"learning_rate": 4.827634099616858e-05, |
|
"loss": 13.9871, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.835249042145594, |
|
"grad_norm": 2.3612937927246094, |
|
"learning_rate": 4.822844827586207e-05, |
|
"loss": 14.043, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.9118773946360155, |
|
"grad_norm": 2.1295549869537354, |
|
"learning_rate": 4.8180555555555555e-05, |
|
"loss": 14.0695, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.9885057471264367, |
|
"grad_norm": 2.768362283706665, |
|
"learning_rate": 4.813266283524904e-05, |
|
"loss": 13.8804, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 14.543105125427246, |
|
"eval_runtime": 44.0531, |
|
"eval_samples_per_second": 29.623, |
|
"eval_steps_per_second": 3.723, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 3.0651340996168583, |
|
"grad_norm": 2.190544366836548, |
|
"learning_rate": 4.808477011494253e-05, |
|
"loss": 13.8831, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.1417624521072796, |
|
"grad_norm": 1.6555811166763306, |
|
"learning_rate": 4.8036877394636016e-05, |
|
"loss": 13.661, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.218390804597701, |
|
"grad_norm": 1.1204612255096436, |
|
"learning_rate": 4.798898467432951e-05, |
|
"loss": 13.9753, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.2950191570881224, |
|
"grad_norm": 2.3801109790802, |
|
"learning_rate": 4.7941091954022996e-05, |
|
"loss": 13.9332, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.371647509578544, |
|
"grad_norm": 1.314393162727356, |
|
"learning_rate": 4.7893199233716476e-05, |
|
"loss": 13.8442, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 2.0551559925079346, |
|
"learning_rate": 4.784530651340996e-05, |
|
"loss": 13.5678, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.524904214559387, |
|
"grad_norm": 1.4303470849990845, |
|
"learning_rate": 4.779741379310345e-05, |
|
"loss": 13.7754, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.6015325670498086, |
|
"grad_norm": 2.2181780338287354, |
|
"learning_rate": 4.774952107279694e-05, |
|
"loss": 13.5568, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.67816091954023, |
|
"grad_norm": 1.377549648284912, |
|
"learning_rate": 4.7701628352490424e-05, |
|
"loss": 13.4359, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.7547892720306515, |
|
"grad_norm": 1.6644877195358276, |
|
"learning_rate": 4.765373563218391e-05, |
|
"loss": 13.6701, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.8314176245210727, |
|
"grad_norm": 1.6416462659835815, |
|
"learning_rate": 4.76058429118774e-05, |
|
"loss": 13.6427, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.9080459770114944, |
|
"grad_norm": 1.5726954936981201, |
|
"learning_rate": 4.7557950191570885e-05, |
|
"loss": 13.6802, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.9846743295019156, |
|
"grad_norm": 1.3120722770690918, |
|
"learning_rate": 4.751005747126437e-05, |
|
"loss": 13.6631, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 14.28848934173584, |
|
"eval_runtime": 44.0456, |
|
"eval_samples_per_second": 29.628, |
|
"eval_steps_per_second": 3.723, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 4.061302681992337, |
|
"grad_norm": 1.9124590158462524, |
|
"learning_rate": 4.746216475095785e-05, |
|
"loss": 13.5388, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.137931034482759, |
|
"grad_norm": 1.3689558506011963, |
|
"learning_rate": 4.741427203065134e-05, |
|
"loss": 13.5553, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 4.21455938697318, |
|
"grad_norm": 1.6370700597763062, |
|
"learning_rate": 4.7366379310344825e-05, |
|
"loss": 13.5781, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 4.291187739463601, |
|
"grad_norm": 1.993304967880249, |
|
"learning_rate": 4.731848659003832e-05, |
|
"loss": 13.5261, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 4.3678160919540225, |
|
"grad_norm": 2.3975770473480225, |
|
"learning_rate": 4.7270593869731806e-05, |
|
"loss": 13.4305, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 1.9231036901474, |
|
"learning_rate": 4.722270114942529e-05, |
|
"loss": 13.3994, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 4.521072796934866, |
|
"grad_norm": 1.0928981304168701, |
|
"learning_rate": 4.717480842911878e-05, |
|
"loss": 13.3212, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 4.597701149425287, |
|
"grad_norm": 1.3092130422592163, |
|
"learning_rate": 4.7126915708812266e-05, |
|
"loss": 13.4476, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 4.674329501915709, |
|
"grad_norm": 2.0151021480560303, |
|
"learning_rate": 4.7079022988505747e-05, |
|
"loss": 13.1863, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 4.75095785440613, |
|
"grad_norm": 1.2778387069702148, |
|
"learning_rate": 4.7031130268199233e-05, |
|
"loss": 13.3661, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 4.827586206896552, |
|
"grad_norm": 1.1671264171600342, |
|
"learning_rate": 4.698371647509579e-05, |
|
"loss": 13.3803, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 4.904214559386973, |
|
"grad_norm": 0.9788312911987305, |
|
"learning_rate": 4.693582375478928e-05, |
|
"loss": 13.495, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 4.980842911877395, |
|
"grad_norm": 3.2978639602661133, |
|
"learning_rate": 4.6887931034482766e-05, |
|
"loss": 13.4834, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 14.041104316711426, |
|
"eval_runtime": 43.9982, |
|
"eval_samples_per_second": 29.66, |
|
"eval_steps_per_second": 3.727, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 5.057471264367816, |
|
"grad_norm": 1.6198067665100098, |
|
"learning_rate": 4.6840038314176246e-05, |
|
"loss": 13.1646, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 5.134099616858237, |
|
"grad_norm": 5.732328414916992, |
|
"learning_rate": 4.679214559386973e-05, |
|
"loss": 13.4168, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 5.210727969348659, |
|
"grad_norm": 1.518420934677124, |
|
"learning_rate": 4.674425287356322e-05, |
|
"loss": 13.2907, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 5.287356321839081, |
|
"grad_norm": 1.6062932014465332, |
|
"learning_rate": 4.6696360153256706e-05, |
|
"loss": 13.406, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 5.363984674329502, |
|
"grad_norm": 2.5659947395324707, |
|
"learning_rate": 4.664846743295019e-05, |
|
"loss": 13.252, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 5.440613026819923, |
|
"grad_norm": 1.4965115785598755, |
|
"learning_rate": 4.660057471264368e-05, |
|
"loss": 13.2683, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 5.517241379310345, |
|
"grad_norm": 2.3210604190826416, |
|
"learning_rate": 4.655268199233717e-05, |
|
"loss": 13.1846, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 5.593869731800766, |
|
"grad_norm": 1.508138656616211, |
|
"learning_rate": 4.6504789272030654e-05, |
|
"loss": 13.1303, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 5.670498084291188, |
|
"grad_norm": 1.2769402265548706, |
|
"learning_rate": 4.645689655172414e-05, |
|
"loss": 13.1109, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 5.747126436781609, |
|
"grad_norm": 3.0062999725341797, |
|
"learning_rate": 4.640900383141763e-05, |
|
"loss": 13.1859, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 5.823754789272031, |
|
"grad_norm": 1.4893639087677002, |
|
"learning_rate": 4.636111111111111e-05, |
|
"loss": 13.2236, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 5.900383141762452, |
|
"grad_norm": 1.9955596923828125, |
|
"learning_rate": 4.63132183908046e-05, |
|
"loss": 13.2806, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 5.977011494252873, |
|
"grad_norm": 1.733920931816101, |
|
"learning_rate": 4.626532567049809e-05, |
|
"loss": 12.9426, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 13.950128555297852, |
|
"eval_runtime": 44.0078, |
|
"eval_samples_per_second": 29.654, |
|
"eval_steps_per_second": 3.727, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 6.053639846743295, |
|
"grad_norm": 1.3697247505187988, |
|
"learning_rate": 4.6217432950191575e-05, |
|
"loss": 13.001, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 6.130268199233717, |
|
"grad_norm": 1.7222646474838257, |
|
"learning_rate": 4.616954022988506e-05, |
|
"loss": 13.1098, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 6.206896551724138, |
|
"grad_norm": 1.5488767623901367, |
|
"learning_rate": 4.612164750957855e-05, |
|
"loss": 13.2406, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 6.283524904214559, |
|
"grad_norm": 1.1356619596481323, |
|
"learning_rate": 4.6073754789272036e-05, |
|
"loss": 13.0969, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 6.360153256704981, |
|
"grad_norm": 2.161534547805786, |
|
"learning_rate": 4.602586206896552e-05, |
|
"loss": 12.8021, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 6.436781609195402, |
|
"grad_norm": 1.42888605594635, |
|
"learning_rate": 4.5977969348659e-05, |
|
"loss": 13.007, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 6.513409961685824, |
|
"grad_norm": 1.5181623697280884, |
|
"learning_rate": 4.593007662835249e-05, |
|
"loss": 13.2494, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 6.590038314176245, |
|
"grad_norm": 2.6794161796569824, |
|
"learning_rate": 4.588218390804598e-05, |
|
"loss": 13.0472, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 1.3213189840316772, |
|
"learning_rate": 4.5834291187739464e-05, |
|
"loss": 12.7648, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 6.743295019157088, |
|
"grad_norm": 1.1679490804672241, |
|
"learning_rate": 4.578639846743295e-05, |
|
"loss": 13.0907, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 6.819923371647509, |
|
"grad_norm": 1.7697467803955078, |
|
"learning_rate": 4.573850574712644e-05, |
|
"loss": 12.8777, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 6.896551724137931, |
|
"grad_norm": 1.7574371099472046, |
|
"learning_rate": 4.5690613026819924e-05, |
|
"loss": 12.8949, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 6.973180076628353, |
|
"grad_norm": 1.8508405685424805, |
|
"learning_rate": 4.564272030651342e-05, |
|
"loss": 13.0364, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 13.742591857910156, |
|
"eval_runtime": 44.1082, |
|
"eval_samples_per_second": 29.586, |
|
"eval_steps_per_second": 3.718, |
|
"step": 9135 |
|
}, |
|
{ |
|
"epoch": 7.049808429118774, |
|
"grad_norm": 1.304430365562439, |
|
"learning_rate": 4.55948275862069e-05, |
|
"loss": 13.1197, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 7.126436781609195, |
|
"grad_norm": 1.112478256225586, |
|
"learning_rate": 4.5546934865900385e-05, |
|
"loss": 13.072, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 7.203065134099617, |
|
"grad_norm": 1.6277681589126587, |
|
"learning_rate": 4.5499521072796937e-05, |
|
"loss": 12.8787, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 7.2796934865900385, |
|
"grad_norm": 1.6854459047317505, |
|
"learning_rate": 4.5451628352490423e-05, |
|
"loss": 12.9961, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 7.35632183908046, |
|
"grad_norm": 1.5988355875015259, |
|
"learning_rate": 4.540373563218391e-05, |
|
"loss": 12.9588, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 7.432950191570881, |
|
"grad_norm": 1.0676491260528564, |
|
"learning_rate": 4.53558429118774e-05, |
|
"loss": 12.8359, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 7.509578544061303, |
|
"grad_norm": 1.8556437492370605, |
|
"learning_rate": 4.5307950191570884e-05, |
|
"loss": 12.813, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 7.586206896551724, |
|
"grad_norm": 1.5877550840377808, |
|
"learning_rate": 4.526005747126437e-05, |
|
"loss": 12.9205, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 7.662835249042145, |
|
"grad_norm": 1.2095483541488647, |
|
"learning_rate": 4.521216475095786e-05, |
|
"loss": 12.9472, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 7.739463601532567, |
|
"grad_norm": 3.998228073120117, |
|
"learning_rate": 4.5164272030651345e-05, |
|
"loss": 12.871, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 7.816091954022989, |
|
"grad_norm": 1.4408106803894043, |
|
"learning_rate": 4.511637931034483e-05, |
|
"loss": 12.9723, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 7.89272030651341, |
|
"grad_norm": 0.9685239791870117, |
|
"learning_rate": 4.506848659003832e-05, |
|
"loss": 12.7816, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 7.969348659003831, |
|
"grad_norm": 2.4164698123931885, |
|
"learning_rate": 4.5020593869731805e-05, |
|
"loss": 12.8656, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 13.643902778625488, |
|
"eval_runtime": 44.1312, |
|
"eval_samples_per_second": 29.571, |
|
"eval_steps_per_second": 3.716, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 8.045977011494253, |
|
"grad_norm": 1.4973284006118774, |
|
"learning_rate": 4.497270114942529e-05, |
|
"loss": 12.9654, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 8.122605363984674, |
|
"grad_norm": 1.9837547540664673, |
|
"learning_rate": 4.492480842911877e-05, |
|
"loss": 12.9358, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 8.199233716475096, |
|
"grad_norm": 2.1501142978668213, |
|
"learning_rate": 4.487691570881226e-05, |
|
"loss": 12.9226, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 8.275862068965518, |
|
"grad_norm": 1.959155797958374, |
|
"learning_rate": 4.4829022988505746e-05, |
|
"loss": 12.8136, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 8.352490421455938, |
|
"grad_norm": 1.7081148624420166, |
|
"learning_rate": 4.478113026819923e-05, |
|
"loss": 12.6215, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 8.42911877394636, |
|
"grad_norm": 3.0818092823028564, |
|
"learning_rate": 4.473323754789272e-05, |
|
"loss": 12.7263, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 8.505747126436782, |
|
"grad_norm": 1.2609460353851318, |
|
"learning_rate": 4.468534482758621e-05, |
|
"loss": 12.615, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 8.582375478927203, |
|
"grad_norm": 1.1553901433944702, |
|
"learning_rate": 4.46374521072797e-05, |
|
"loss": 12.9115, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 8.659003831417625, |
|
"grad_norm": 2.876321792602539, |
|
"learning_rate": 4.458955938697319e-05, |
|
"loss": 12.8372, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 8.735632183908045, |
|
"grad_norm": 2.3537096977233887, |
|
"learning_rate": 4.454166666666667e-05, |
|
"loss": 12.8684, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 8.812260536398467, |
|
"grad_norm": 1.4264323711395264, |
|
"learning_rate": 4.4493773946360154e-05, |
|
"loss": 12.6151, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 1.8997728824615479, |
|
"learning_rate": 4.4446360153256706e-05, |
|
"loss": 12.8187, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 8.96551724137931, |
|
"grad_norm": 1.8338580131530762, |
|
"learning_rate": 4.439846743295019e-05, |
|
"loss": 12.7365, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 13.53819465637207, |
|
"eval_runtime": 44.0314, |
|
"eval_samples_per_second": 29.638, |
|
"eval_steps_per_second": 3.725, |
|
"step": 11745 |
|
}, |
|
{ |
|
"epoch": 9.042145593869732, |
|
"grad_norm": 12.737005233764648, |
|
"learning_rate": 4.4351053639846745e-05, |
|
"loss": 12.8002, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 9.118773946360154, |
|
"grad_norm": 1.8820631504058838, |
|
"learning_rate": 4.430316091954023e-05, |
|
"loss": 12.8415, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 9.195402298850574, |
|
"grad_norm": 1.5012093782424927, |
|
"learning_rate": 4.425526819923372e-05, |
|
"loss": 12.8011, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 9.272030651340996, |
|
"grad_norm": 2.5062639713287354, |
|
"learning_rate": 4.4207375478927205e-05, |
|
"loss": 12.7156, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 9.348659003831418, |
|
"grad_norm": 1.5295358896255493, |
|
"learning_rate": 4.415948275862069e-05, |
|
"loss": 12.8449, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 9.425287356321839, |
|
"grad_norm": 1.6232823133468628, |
|
"learning_rate": 4.411159003831418e-05, |
|
"loss": 12.7345, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 9.50191570881226, |
|
"grad_norm": 1.4783318042755127, |
|
"learning_rate": 4.4063697318007666e-05, |
|
"loss": 12.7392, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 9.578544061302683, |
|
"grad_norm": 1.7494572401046753, |
|
"learning_rate": 4.4015804597701146e-05, |
|
"loss": 12.6017, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 9.655172413793103, |
|
"grad_norm": 2.065991163253784, |
|
"learning_rate": 4.396791187739464e-05, |
|
"loss": 12.695, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 9.731800766283525, |
|
"grad_norm": 1.2360838651657104, |
|
"learning_rate": 4.3920019157088127e-05, |
|
"loss": 12.7994, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 9.808429118773946, |
|
"grad_norm": 2.084902048110962, |
|
"learning_rate": 4.3872126436781613e-05, |
|
"loss": 12.6864, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 9.885057471264368, |
|
"grad_norm": 1.4381409883499146, |
|
"learning_rate": 4.38242337164751e-05, |
|
"loss": 12.6875, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 9.96168582375479, |
|
"grad_norm": 1.5936471223831177, |
|
"learning_rate": 4.377634099616859e-05, |
|
"loss": 12.6413, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 13.456477165222168, |
|
"eval_runtime": 44.0741, |
|
"eval_samples_per_second": 29.609, |
|
"eval_steps_per_second": 3.721, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 10.03831417624521, |
|
"grad_norm": 1.1829323768615723, |
|
"learning_rate": 4.3728448275862074e-05, |
|
"loss": 12.7182, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 10.114942528735632, |
|
"grad_norm": 1.7679022550582886, |
|
"learning_rate": 4.368055555555556e-05, |
|
"loss": 12.7508, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 10.191570881226054, |
|
"grad_norm": 2.4053192138671875, |
|
"learning_rate": 4.363266283524904e-05, |
|
"loss": 12.5668, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 10.268199233716475, |
|
"grad_norm": 2.4858756065368652, |
|
"learning_rate": 4.358477011494253e-05, |
|
"loss": 12.6561, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 10.344827586206897, |
|
"grad_norm": 2.138453483581543, |
|
"learning_rate": 4.3536877394636015e-05, |
|
"loss": 12.6829, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 10.421455938697317, |
|
"grad_norm": 1.490075707435608, |
|
"learning_rate": 4.34889846743295e-05, |
|
"loss": 12.7284, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 10.49808429118774, |
|
"grad_norm": 3.1338703632354736, |
|
"learning_rate": 4.344109195402299e-05, |
|
"loss": 12.5722, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 10.574712643678161, |
|
"grad_norm": 1.844388723373413, |
|
"learning_rate": 4.3393199233716475e-05, |
|
"loss": 12.8212, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 10.651340996168582, |
|
"grad_norm": 1.9379137754440308, |
|
"learning_rate": 4.334530651340996e-05, |
|
"loss": 12.368, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 10.727969348659004, |
|
"grad_norm": 4.608842849731445, |
|
"learning_rate": 4.3297413793103456e-05, |
|
"loss": 12.3258, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 10.804597701149426, |
|
"grad_norm": 1.607155680656433, |
|
"learning_rate": 4.325e-05, |
|
"loss": 12.8355, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 10.881226053639846, |
|
"grad_norm": 1.7595943212509155, |
|
"learning_rate": 4.320210727969349e-05, |
|
"loss": 12.6135, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 10.957854406130268, |
|
"grad_norm": 1.7879704236984253, |
|
"learning_rate": 4.3154214559386975e-05, |
|
"loss": 12.7107, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 13.364398002624512, |
|
"eval_runtime": 44.0273, |
|
"eval_samples_per_second": 29.641, |
|
"eval_steps_per_second": 3.725, |
|
"step": 14355 |
|
}, |
|
{ |
|
"epoch": 11.03448275862069, |
|
"grad_norm": 3.187349557876587, |
|
"learning_rate": 4.310632183908046e-05, |
|
"loss": 12.7471, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 11.11111111111111, |
|
"grad_norm": 3.118311643600464, |
|
"learning_rate": 4.305842911877395e-05, |
|
"loss": 12.4422, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 11.187739463601533, |
|
"grad_norm": 2.276580333709717, |
|
"learning_rate": 4.3010536398467435e-05, |
|
"loss": 12.5443, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 11.264367816091955, |
|
"grad_norm": 1.3369340896606445, |
|
"learning_rate": 4.296264367816092e-05, |
|
"loss": 12.7497, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 11.340996168582375, |
|
"grad_norm": 1.2438215017318726, |
|
"learning_rate": 4.291475095785441e-05, |
|
"loss": 12.6343, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 11.417624521072797, |
|
"grad_norm": 1.668867826461792, |
|
"learning_rate": 4.2866858237547896e-05, |
|
"loss": 12.673, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 11.494252873563218, |
|
"grad_norm": 2.550316572189331, |
|
"learning_rate": 4.281896551724138e-05, |
|
"loss": 12.7346, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 11.57088122605364, |
|
"grad_norm": 1.3926326036453247, |
|
"learning_rate": 4.277107279693487e-05, |
|
"loss": 12.5431, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 11.647509578544062, |
|
"grad_norm": 1.3561134338378906, |
|
"learning_rate": 4.272318007662836e-05, |
|
"loss": 12.4943, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 11.724137931034482, |
|
"grad_norm": 1.4978444576263428, |
|
"learning_rate": 4.2675287356321844e-05, |
|
"loss": 12.4103, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 11.800766283524904, |
|
"grad_norm": 1.8163210153579712, |
|
"learning_rate": 4.262739463601533e-05, |
|
"loss": 12.5454, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 11.877394636015326, |
|
"grad_norm": 1.3819987773895264, |
|
"learning_rate": 4.257950191570881e-05, |
|
"loss": 12.5219, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 11.954022988505747, |
|
"grad_norm": 1.6237196922302246, |
|
"learning_rate": 4.25316091954023e-05, |
|
"loss": 12.5876, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 13.39963436126709, |
|
"eval_runtime": 44.002, |
|
"eval_samples_per_second": 29.658, |
|
"eval_steps_per_second": 3.727, |
|
"step": 15660 |
|
}, |
|
{ |
|
"epoch": 12.030651340996169, |
|
"grad_norm": 1.1271090507507324, |
|
"learning_rate": 4.2483716475095784e-05, |
|
"loss": 12.3581, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 12.10727969348659, |
|
"grad_norm": 1.5027310848236084, |
|
"learning_rate": 4.243582375478927e-05, |
|
"loss": 12.5517, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 12.183908045977011, |
|
"grad_norm": 1.5543391704559326, |
|
"learning_rate": 4.238793103448276e-05, |
|
"loss": 12.7011, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 12.260536398467433, |
|
"grad_norm": 1.7037404775619507, |
|
"learning_rate": 4.2340038314176245e-05, |
|
"loss": 12.289, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 12.337164750957854, |
|
"grad_norm": 4.505245208740234, |
|
"learning_rate": 4.229214559386974e-05, |
|
"loss": 12.3584, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 12.413793103448276, |
|
"grad_norm": 1.5144113302230835, |
|
"learning_rate": 4.2244252873563225e-05, |
|
"loss": 12.4209, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 12.490421455938698, |
|
"grad_norm": 1.2396819591522217, |
|
"learning_rate": 4.2196360153256706e-05, |
|
"loss": 12.4463, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 12.567049808429118, |
|
"grad_norm": 5.947683334350586, |
|
"learning_rate": 4.214846743295019e-05, |
|
"loss": 12.6401, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 12.64367816091954, |
|
"grad_norm": 2.070812225341797, |
|
"learning_rate": 4.210057471264368e-05, |
|
"loss": 12.6885, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 12.720306513409962, |
|
"grad_norm": 1.7540252208709717, |
|
"learning_rate": 4.2052681992337166e-05, |
|
"loss": 12.3138, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 12.796934865900383, |
|
"grad_norm": 1.3372827768325806, |
|
"learning_rate": 4.200478927203065e-05, |
|
"loss": 12.8475, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 12.873563218390805, |
|
"grad_norm": 1.6598443984985352, |
|
"learning_rate": 4.195689655172414e-05, |
|
"loss": 12.575, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 12.950191570881227, |
|
"grad_norm": 1.5420461893081665, |
|
"learning_rate": 4.190900383141763e-05, |
|
"loss": 12.499, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 13.359596252441406, |
|
"eval_runtime": 43.9919, |
|
"eval_samples_per_second": 29.665, |
|
"eval_steps_per_second": 3.728, |
|
"step": 16965 |
|
}, |
|
{ |
|
"epoch": 13.026819923371647, |
|
"grad_norm": 1.785803198814392, |
|
"learning_rate": 4.1861111111111114e-05, |
|
"loss": 12.3123, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 13.10344827586207, |
|
"grad_norm": 3.8619072437286377, |
|
"learning_rate": 4.1813697318007665e-05, |
|
"loss": 12.4633, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 13.18007662835249, |
|
"grad_norm": 1.2189018726348877, |
|
"learning_rate": 4.176580459770115e-05, |
|
"loss": 12.4732, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 13.256704980842912, |
|
"grad_norm": 3.579725742340088, |
|
"learning_rate": 4.171791187739464e-05, |
|
"loss": 12.3486, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 1.258268117904663, |
|
"learning_rate": 4.1670019157088126e-05, |
|
"loss": 12.5506, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 13.409961685823754, |
|
"grad_norm": 1.6867891550064087, |
|
"learning_rate": 4.162212643678161e-05, |
|
"loss": 12.5667, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 13.486590038314176, |
|
"grad_norm": 1.5345897674560547, |
|
"learning_rate": 4.15742337164751e-05, |
|
"loss": 12.5206, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 13.563218390804598, |
|
"grad_norm": 1.1699010133743286, |
|
"learning_rate": 4.152634099616859e-05, |
|
"loss": 12.3728, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 13.639846743295019, |
|
"grad_norm": 1.669938325881958, |
|
"learning_rate": 4.147844827586207e-05, |
|
"loss": 12.4601, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 13.71647509578544, |
|
"grad_norm": 1.2530852556228638, |
|
"learning_rate": 4.1430555555555554e-05, |
|
"loss": 12.4501, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 13.793103448275861, |
|
"grad_norm": 1.790138840675354, |
|
"learning_rate": 4.138266283524904e-05, |
|
"loss": 12.467, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 13.869731800766283, |
|
"grad_norm": 1.3373574018478394, |
|
"learning_rate": 4.133477011494253e-05, |
|
"loss": 12.4602, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 13.946360153256705, |
|
"grad_norm": 1.837951898574829, |
|
"learning_rate": 4.128687739463602e-05, |
|
"loss": 12.4591, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 13.289255142211914, |
|
"eval_runtime": 43.9866, |
|
"eval_samples_per_second": 29.668, |
|
"eval_steps_per_second": 3.728, |
|
"step": 18270 |
|
}, |
|
{ |
|
"epoch": 14.022988505747126, |
|
"grad_norm": 1.540867805480957, |
|
"learning_rate": 4.123898467432951e-05, |
|
"loss": 12.59, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 14.099616858237548, |
|
"grad_norm": 1.6285018920898438, |
|
"learning_rate": 4.1191091954022995e-05, |
|
"loss": 12.5162, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 14.17624521072797, |
|
"grad_norm": 0.8983919620513916, |
|
"learning_rate": 4.114319923371648e-05, |
|
"loss": 12.4312, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 14.25287356321839, |
|
"grad_norm": 1.7475948333740234, |
|
"learning_rate": 4.109530651340996e-05, |
|
"loss": 12.483, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 14.329501915708812, |
|
"grad_norm": 1.723708987236023, |
|
"learning_rate": 4.104741379310345e-05, |
|
"loss": 12.5177, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 14.406130268199234, |
|
"grad_norm": 1.3113809823989868, |
|
"learning_rate": 4.0999521072796936e-05, |
|
"loss": 12.3171, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 14.482758620689655, |
|
"grad_norm": 1.7641185522079468, |
|
"learning_rate": 4.095162835249042e-05, |
|
"loss": 12.4669, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 14.559386973180077, |
|
"grad_norm": 1.6181635856628418, |
|
"learning_rate": 4.090373563218391e-05, |
|
"loss": 12.3302, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 14.636015325670499, |
|
"grad_norm": 1.2323795557022095, |
|
"learning_rate": 4.0855842911877396e-05, |
|
"loss": 12.4211, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 14.71264367816092, |
|
"grad_norm": 1.7597166299819946, |
|
"learning_rate": 4.080795019157088e-05, |
|
"loss": 12.4985, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 14.789272030651341, |
|
"grad_norm": 1.0281277894973755, |
|
"learning_rate": 4.076005747126437e-05, |
|
"loss": 12.5672, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 14.865900383141762, |
|
"grad_norm": 3.3272478580474854, |
|
"learning_rate": 4.071216475095786e-05, |
|
"loss": 12.2671, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 14.942528735632184, |
|
"grad_norm": 3.1264896392822266, |
|
"learning_rate": 4.066427203065134e-05, |
|
"loss": 12.4736, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 13.205364227294922, |
|
"eval_runtime": 43.9612, |
|
"eval_samples_per_second": 29.685, |
|
"eval_steps_per_second": 3.731, |
|
"step": 19575 |
|
}, |
|
{ |
|
"epoch": 15.019157088122606, |
|
"grad_norm": 1.568294882774353, |
|
"learning_rate": 4.061637931034483e-05, |
|
"loss": 12.4604, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 15.095785440613026, |
|
"grad_norm": 1.919912576675415, |
|
"learning_rate": 4.056848659003832e-05, |
|
"loss": 12.3773, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 15.172413793103448, |
|
"grad_norm": 1.5357537269592285, |
|
"learning_rate": 4.0520593869731804e-05, |
|
"loss": 12.3406, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 15.24904214559387, |
|
"grad_norm": 1.7306512594223022, |
|
"learning_rate": 4.0473180076628356e-05, |
|
"loss": 12.4036, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 15.32567049808429, |
|
"grad_norm": 1.6036773920059204, |
|
"learning_rate": 4.0425287356321836e-05, |
|
"loss": 12.3554, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 15.402298850574713, |
|
"grad_norm": 1.211962342262268, |
|
"learning_rate": 4.037739463601532e-05, |
|
"loss": 12.5084, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 15.478927203065133, |
|
"grad_norm": 1.4626506567001343, |
|
"learning_rate": 4.032950191570881e-05, |
|
"loss": 12.3593, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 15.555555555555555, |
|
"grad_norm": 1.6557157039642334, |
|
"learning_rate": 4.0281609195402304e-05, |
|
"loss": 12.3249, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 15.632183908045977, |
|
"grad_norm": 1.735300064086914, |
|
"learning_rate": 4.023371647509579e-05, |
|
"loss": 12.2958, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 15.708812260536398, |
|
"grad_norm": 1.2972387075424194, |
|
"learning_rate": 4.018582375478928e-05, |
|
"loss": 12.4011, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 15.78544061302682, |
|
"grad_norm": 1.2028956413269043, |
|
"learning_rate": 4.0137931034482764e-05, |
|
"loss": 12.3923, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 15.862068965517242, |
|
"grad_norm": 1.9574451446533203, |
|
"learning_rate": 4.009003831417625e-05, |
|
"loss": 12.4927, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 15.938697318007662, |
|
"grad_norm": 2.3753159046173096, |
|
"learning_rate": 4.004214559386973e-05, |
|
"loss": 12.4565, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 13.146517753601074, |
|
"eval_runtime": 43.956, |
|
"eval_samples_per_second": 29.689, |
|
"eval_steps_per_second": 3.731, |
|
"step": 20880 |
|
}, |
|
{ |
|
"epoch": 16.015325670498083, |
|
"grad_norm": 1.4980436563491821, |
|
"learning_rate": 3.999425287356322e-05, |
|
"loss": 12.4546, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 16.091954022988507, |
|
"grad_norm": 1.2177377939224243, |
|
"learning_rate": 3.9946360153256705e-05, |
|
"loss": 12.3682, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 16.168582375478927, |
|
"grad_norm": 1.9785245656967163, |
|
"learning_rate": 3.989846743295019e-05, |
|
"loss": 12.4315, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 16.245210727969347, |
|
"grad_norm": 2.2773125171661377, |
|
"learning_rate": 3.985057471264368e-05, |
|
"loss": 12.4728, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 16.32183908045977, |
|
"grad_norm": 1.1049697399139404, |
|
"learning_rate": 3.9802681992337166e-05, |
|
"loss": 12.0735, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 16.39846743295019, |
|
"grad_norm": 2.937175750732422, |
|
"learning_rate": 3.975478927203065e-05, |
|
"loss": 12.4713, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 16.47509578544061, |
|
"grad_norm": 1.058626651763916, |
|
"learning_rate": 3.970689655172414e-05, |
|
"loss": 12.3329, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 16.551724137931036, |
|
"grad_norm": 2.357311248779297, |
|
"learning_rate": 3.9659003831417626e-05, |
|
"loss": 12.2249, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 16.628352490421456, |
|
"grad_norm": 1.0534141063690186, |
|
"learning_rate": 3.961111111111111e-05, |
|
"loss": 12.4414, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 16.704980842911876, |
|
"grad_norm": 1.5288047790527344, |
|
"learning_rate": 3.95632183908046e-05, |
|
"loss": 12.0682, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 16.7816091954023, |
|
"grad_norm": 2.628070831298828, |
|
"learning_rate": 3.951532567049809e-05, |
|
"loss": 12.367, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 16.85823754789272, |
|
"grad_norm": 1.4049383401870728, |
|
"learning_rate": 3.9467432950191574e-05, |
|
"loss": 12.1073, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 16.93486590038314, |
|
"grad_norm": 1.8470909595489502, |
|
"learning_rate": 3.941954022988506e-05, |
|
"loss": 12.3757, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 13.134416580200195, |
|
"eval_runtime": 44.0763, |
|
"eval_samples_per_second": 29.608, |
|
"eval_steps_per_second": 3.721, |
|
"step": 22185 |
|
}, |
|
{ |
|
"epoch": 17.011494252873565, |
|
"grad_norm": 1.1388458013534546, |
|
"learning_rate": 3.937164750957855e-05, |
|
"loss": 12.6443, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 17.088122605363985, |
|
"grad_norm": 1.202028512954712, |
|
"learning_rate": 3.9323754789272034e-05, |
|
"loss": 12.3013, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 17.164750957854405, |
|
"grad_norm": 1.210375189781189, |
|
"learning_rate": 3.927586206896552e-05, |
|
"loss": 12.4812, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 17.24137931034483, |
|
"grad_norm": 1.6550730466842651, |
|
"learning_rate": 3.922796934865901e-05, |
|
"loss": 12.3152, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 17.31800766283525, |
|
"grad_norm": 1.5777093172073364, |
|
"learning_rate": 3.918007662835249e-05, |
|
"loss": 12.2296, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 17.39463601532567, |
|
"grad_norm": 7.877992153167725, |
|
"learning_rate": 3.9132183908045975e-05, |
|
"loss": 12.4408, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 17.47126436781609, |
|
"grad_norm": 1.6760473251342773, |
|
"learning_rate": 3.908429118773946e-05, |
|
"loss": 12.251, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 17.547892720306514, |
|
"grad_norm": 2.4793410301208496, |
|
"learning_rate": 3.903639846743295e-05, |
|
"loss": 12.3864, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 17.624521072796934, |
|
"grad_norm": 1.331120491027832, |
|
"learning_rate": 3.8988505747126436e-05, |
|
"loss": 12.0078, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 17.701149425287355, |
|
"grad_norm": 1.1477069854736328, |
|
"learning_rate": 3.894109195402299e-05, |
|
"loss": 12.2234, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 17.77777777777778, |
|
"grad_norm": 1.5665520429611206, |
|
"learning_rate": 3.8893199233716474e-05, |
|
"loss": 12.2716, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 17.8544061302682, |
|
"grad_norm": 1.4720168113708496, |
|
"learning_rate": 3.884530651340996e-05, |
|
"loss": 12.2528, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 17.93103448275862, |
|
"grad_norm": 1.4990317821502686, |
|
"learning_rate": 3.879741379310345e-05, |
|
"loss": 12.4111, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 13.10958194732666, |
|
"eval_runtime": 43.9884, |
|
"eval_samples_per_second": 29.667, |
|
"eval_steps_per_second": 3.728, |
|
"step": 23490 |
|
}, |
|
{ |
|
"epoch": 18.007662835249043, |
|
"grad_norm": 1.653239130973816, |
|
"learning_rate": 3.8749521072796935e-05, |
|
"loss": 12.4558, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 18.084291187739463, |
|
"grad_norm": 1.3574182987213135, |
|
"learning_rate": 3.870162835249042e-05, |
|
"loss": 12.3242, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 18.160919540229884, |
|
"grad_norm": 2.0138070583343506, |
|
"learning_rate": 3.865373563218391e-05, |
|
"loss": 12.2255, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 18.237547892720308, |
|
"grad_norm": 1.6546958684921265, |
|
"learning_rate": 3.8605842911877396e-05, |
|
"loss": 12.3826, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 18.314176245210728, |
|
"grad_norm": 1.304247498512268, |
|
"learning_rate": 3.855795019157088e-05, |
|
"loss": 12.1766, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 18.39080459770115, |
|
"grad_norm": 1.109941005706787, |
|
"learning_rate": 3.851005747126437e-05, |
|
"loss": 12.3784, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 18.467432950191572, |
|
"grad_norm": 4.5435872077941895, |
|
"learning_rate": 3.8462164750957856e-05, |
|
"loss": 12.2292, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 18.544061302681992, |
|
"grad_norm": 2.141022205352783, |
|
"learning_rate": 3.841427203065134e-05, |
|
"loss": 12.2826, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 18.620689655172413, |
|
"grad_norm": 1.6946494579315186, |
|
"learning_rate": 3.836637931034483e-05, |
|
"loss": 12.3012, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 18.697318007662837, |
|
"grad_norm": 1.3159388303756714, |
|
"learning_rate": 3.831848659003832e-05, |
|
"loss": 12.1835, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 18.773946360153257, |
|
"grad_norm": 2.499986410140991, |
|
"learning_rate": 3.8270593869731804e-05, |
|
"loss": 12.4302, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 18.850574712643677, |
|
"grad_norm": 1.7443987131118774, |
|
"learning_rate": 3.822270114942529e-05, |
|
"loss": 12.5402, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 18.9272030651341, |
|
"grad_norm": 1.4758720397949219, |
|
"learning_rate": 3.817480842911878e-05, |
|
"loss": 12.3978, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 13.101744651794434, |
|
"eval_runtime": 43.9919, |
|
"eval_samples_per_second": 29.665, |
|
"eval_steps_per_second": 3.728, |
|
"step": 24795 |
|
}, |
|
{ |
|
"epoch": 19.00383141762452, |
|
"grad_norm": 1.774843454360962, |
|
"learning_rate": 3.812691570881226e-05, |
|
"loss": 12.2954, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 19.080459770114942, |
|
"grad_norm": 1.693176031112671, |
|
"learning_rate": 3.8079022988505745e-05, |
|
"loss": 12.3156, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 19.157088122605366, |
|
"grad_norm": 1.3531700372695923, |
|
"learning_rate": 3.803113026819923e-05, |
|
"loss": 12.3989, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 19.233716475095786, |
|
"grad_norm": 2.083587884902954, |
|
"learning_rate": 3.798323754789272e-05, |
|
"loss": 12.3523, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 19.310344827586206, |
|
"grad_norm": 2.1645917892456055, |
|
"learning_rate": 3.793534482758621e-05, |
|
"loss": 12.0512, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 19.386973180076627, |
|
"grad_norm": 1.8869907855987549, |
|
"learning_rate": 3.78874521072797e-05, |
|
"loss": 12.4837, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 19.46360153256705, |
|
"grad_norm": 1.2421497106552124, |
|
"learning_rate": 3.7840038314176244e-05, |
|
"loss": 11.9937, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 19.54022988505747, |
|
"grad_norm": 1.5155110359191895, |
|
"learning_rate": 3.779214559386973e-05, |
|
"loss": 12.2264, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 19.61685823754789, |
|
"grad_norm": 1.1511332988739014, |
|
"learning_rate": 3.774425287356322e-05, |
|
"loss": 12.2063, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 19.693486590038315, |
|
"grad_norm": 1.8984183073043823, |
|
"learning_rate": 3.7696360153256705e-05, |
|
"loss": 12.3237, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 19.770114942528735, |
|
"grad_norm": 0.9674005508422852, |
|
"learning_rate": 3.764846743295019e-05, |
|
"loss": 12.1877, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 19.846743295019156, |
|
"grad_norm": 2.0560641288757324, |
|
"learning_rate": 3.7600574712643685e-05, |
|
"loss": 12.2343, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 19.92337164750958, |
|
"grad_norm": 1.3923600912094116, |
|
"learning_rate": 3.755268199233717e-05, |
|
"loss": 12.2683, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 2.9314024448394775, |
|
"learning_rate": 3.750478927203065e-05, |
|
"loss": 12.3074, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 13.07620906829834, |
|
"eval_runtime": 43.9934, |
|
"eval_samples_per_second": 29.664, |
|
"eval_steps_per_second": 3.728, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 20.07662835249042, |
|
"grad_norm": 1.5305142402648926, |
|
"learning_rate": 3.745689655172414e-05, |
|
"loss": 12.2615, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 20.153256704980844, |
|
"grad_norm": 1.3846060037612915, |
|
"learning_rate": 3.7409003831417626e-05, |
|
"loss": 12.3109, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 20.229885057471265, |
|
"grad_norm": 3.0465173721313477, |
|
"learning_rate": 3.736111111111111e-05, |
|
"loss": 12.258, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 20.306513409961685, |
|
"grad_norm": 3.9723782539367676, |
|
"learning_rate": 3.73132183908046e-05, |
|
"loss": 12.2494, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 20.38314176245211, |
|
"grad_norm": 1.464296817779541, |
|
"learning_rate": 3.7265325670498086e-05, |
|
"loss": 12.2231, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 20.45977011494253, |
|
"grad_norm": 1.6789374351501465, |
|
"learning_rate": 3.721743295019157e-05, |
|
"loss": 12.3391, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 20.53639846743295, |
|
"grad_norm": 1.1731619834899902, |
|
"learning_rate": 3.716954022988506e-05, |
|
"loss": 12.2172, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 20.613026819923373, |
|
"grad_norm": 2.8839802742004395, |
|
"learning_rate": 3.712164750957855e-05, |
|
"loss": 12.251, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 20.689655172413794, |
|
"grad_norm": 1.3104863166809082, |
|
"learning_rate": 3.707375478927203e-05, |
|
"loss": 12.4269, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 20.766283524904214, |
|
"grad_norm": 2.5182230472564697, |
|
"learning_rate": 3.7025862068965514e-05, |
|
"loss": 12.1972, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 20.842911877394634, |
|
"grad_norm": 1.4510316848754883, |
|
"learning_rate": 3.6977969348659e-05, |
|
"loss": 12.1446, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 20.919540229885058, |
|
"grad_norm": 1.7377287149429321, |
|
"learning_rate": 3.6930076628352495e-05, |
|
"loss": 12.2374, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 20.99616858237548, |
|
"grad_norm": 1.308686375617981, |
|
"learning_rate": 3.6882662835249046e-05, |
|
"loss": 12.2169, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 13.027502059936523, |
|
"eval_runtime": 44.025, |
|
"eval_samples_per_second": 29.642, |
|
"eval_steps_per_second": 3.725, |
|
"step": 27405 |
|
}, |
|
{ |
|
"epoch": 21.0727969348659, |
|
"grad_norm": 1.7697923183441162, |
|
"learning_rate": 3.6834770114942526e-05, |
|
"loss": 12.3711, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 21.149425287356323, |
|
"grad_norm": 1.2963312864303589, |
|
"learning_rate": 3.678687739463601e-05, |
|
"loss": 12.1974, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 21.226053639846743, |
|
"grad_norm": 1.617470383644104, |
|
"learning_rate": 3.67389846743295e-05, |
|
"loss": 12.1879, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 21.302681992337163, |
|
"grad_norm": 2.007051944732666, |
|
"learning_rate": 3.669109195402299e-05, |
|
"loss": 12.2758, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 21.379310344827587, |
|
"grad_norm": 1.4421669244766235, |
|
"learning_rate": 3.6643199233716474e-05, |
|
"loss": 12.1852, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 21.455938697318008, |
|
"grad_norm": 2.678457260131836, |
|
"learning_rate": 3.659530651340997e-05, |
|
"loss": 12.3418, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 21.532567049808428, |
|
"grad_norm": 1.4007712602615356, |
|
"learning_rate": 3.6547413793103455e-05, |
|
"loss": 12.4764, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 21.60919540229885, |
|
"grad_norm": 4.606558322906494, |
|
"learning_rate": 3.649952107279694e-05, |
|
"loss": 12.2566, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 21.685823754789272, |
|
"grad_norm": 1.354705810546875, |
|
"learning_rate": 3.645162835249042e-05, |
|
"loss": 12.2371, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 21.762452107279692, |
|
"grad_norm": 1.7736151218414307, |
|
"learning_rate": 3.640373563218391e-05, |
|
"loss": 12.4794, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 21.839080459770116, |
|
"grad_norm": 1.2875999212265015, |
|
"learning_rate": 3.6355842911877395e-05, |
|
"loss": 12.0016, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 21.915708812260537, |
|
"grad_norm": 1.932035207748413, |
|
"learning_rate": 3.630795019157088e-05, |
|
"loss": 12.3018, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 21.992337164750957, |
|
"grad_norm": 3.066443920135498, |
|
"learning_rate": 3.626005747126437e-05, |
|
"loss": 12.0117, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 13.03292179107666, |
|
"eval_runtime": 44.0005, |
|
"eval_samples_per_second": 29.659, |
|
"eval_steps_per_second": 3.727, |
|
"step": 28710 |
|
}, |
|
{ |
|
"epoch": 22.06896551724138, |
|
"grad_norm": 0.97423255443573, |
|
"learning_rate": 3.6212164750957856e-05, |
|
"loss": 12.4442, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 22.1455938697318, |
|
"grad_norm": 1.7552623748779297, |
|
"learning_rate": 3.616427203065134e-05, |
|
"loss": 12.2976, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 22.22222222222222, |
|
"grad_norm": 1.5857703685760498, |
|
"learning_rate": 3.611637931034483e-05, |
|
"loss": 12.1968, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 22.298850574712645, |
|
"grad_norm": 1.381238341331482, |
|
"learning_rate": 3.6068486590038317e-05, |
|
"loss": 12.0455, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 22.375478927203066, |
|
"grad_norm": 1.3380298614501953, |
|
"learning_rate": 3.6020593869731803e-05, |
|
"loss": 12.1833, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 22.452107279693486, |
|
"grad_norm": 2.3591909408569336, |
|
"learning_rate": 3.5972701149425284e-05, |
|
"loss": 12.1562, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 22.52873563218391, |
|
"grad_norm": 2.544651508331299, |
|
"learning_rate": 3.592528735632184e-05, |
|
"loss": 12.1318, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 22.60536398467433, |
|
"grad_norm": 1.204476237297058, |
|
"learning_rate": 3.587739463601533e-05, |
|
"loss": 12.3856, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 22.68199233716475, |
|
"grad_norm": 1.453444004058838, |
|
"learning_rate": 3.5829501915708816e-05, |
|
"loss": 12.0971, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 22.75862068965517, |
|
"grad_norm": 2.287437915802002, |
|
"learning_rate": 3.5781609195402296e-05, |
|
"loss": 12.1294, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 22.835249042145595, |
|
"grad_norm": 2.790942907333374, |
|
"learning_rate": 3.573371647509578e-05, |
|
"loss": 12.1613, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 22.911877394636015, |
|
"grad_norm": 1.6170670986175537, |
|
"learning_rate": 3.568582375478927e-05, |
|
"loss": 12.0175, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 22.988505747126435, |
|
"grad_norm": 1.724195122718811, |
|
"learning_rate": 3.5637931034482757e-05, |
|
"loss": 12.1815, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 12.992958068847656, |
|
"eval_runtime": 44.0141, |
|
"eval_samples_per_second": 29.65, |
|
"eval_steps_per_second": 3.726, |
|
"step": 30015 |
|
}, |
|
{ |
|
"epoch": 23.06513409961686, |
|
"grad_norm": 3.8932502269744873, |
|
"learning_rate": 3.559003831417625e-05, |
|
"loss": 12.1987, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 23.14176245210728, |
|
"grad_norm": 1.8813198804855347, |
|
"learning_rate": 3.554214559386974e-05, |
|
"loss": 12.2208, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 23.2183908045977, |
|
"grad_norm": 1.0299080610275269, |
|
"learning_rate": 3.5494252873563224e-05, |
|
"loss": 12.1662, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 23.295019157088124, |
|
"grad_norm": 2.68420672416687, |
|
"learning_rate": 3.544636015325671e-05, |
|
"loss": 12.1013, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 23.371647509578544, |
|
"grad_norm": 0.9587434530258179, |
|
"learning_rate": 3.539846743295019e-05, |
|
"loss": 12.3426, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 23.448275862068964, |
|
"grad_norm": 1.8168953657150269, |
|
"learning_rate": 3.535057471264368e-05, |
|
"loss": 12.2303, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 23.52490421455939, |
|
"grad_norm": 1.2712435722351074, |
|
"learning_rate": 3.5302681992337165e-05, |
|
"loss": 12.275, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 23.60153256704981, |
|
"grad_norm": 1.0442867279052734, |
|
"learning_rate": 3.525478927203065e-05, |
|
"loss": 12.1344, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 23.67816091954023, |
|
"grad_norm": 2.2171154022216797, |
|
"learning_rate": 3.520689655172414e-05, |
|
"loss": 12.1554, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 23.754789272030653, |
|
"grad_norm": 1.5863583087921143, |
|
"learning_rate": 3.5159003831417625e-05, |
|
"loss": 12.1003, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 23.831417624521073, |
|
"grad_norm": 1.4239143133163452, |
|
"learning_rate": 3.511111111111111e-05, |
|
"loss": 12.1271, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 23.908045977011493, |
|
"grad_norm": 2.044018030166626, |
|
"learning_rate": 3.50632183908046e-05, |
|
"loss": 12.3269, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 23.984674329501917, |
|
"grad_norm": 2.9049460887908936, |
|
"learning_rate": 3.5015325670498086e-05, |
|
"loss": 12.0403, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 13.009976387023926, |
|
"eval_runtime": 44.0062, |
|
"eval_samples_per_second": 29.655, |
|
"eval_steps_per_second": 3.727, |
|
"step": 31320 |
|
}, |
|
{ |
|
"epoch": 24.061302681992338, |
|
"grad_norm": 1.4207292795181274, |
|
"learning_rate": 3.496743295019157e-05, |
|
"loss": 12.0634, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 24.137931034482758, |
|
"grad_norm": 1.886399269104004, |
|
"learning_rate": 3.491954022988506e-05, |
|
"loss": 12.1573, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 24.21455938697318, |
|
"grad_norm": 2.239217519760132, |
|
"learning_rate": 3.487164750957855e-05, |
|
"loss": 12.3025, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 24.291187739463602, |
|
"grad_norm": 1.495377540588379, |
|
"learning_rate": 3.4823754789272034e-05, |
|
"loss": 12.1236, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 24.367816091954023, |
|
"grad_norm": 1.4570187330245972, |
|
"learning_rate": 3.477586206896552e-05, |
|
"loss": 12.1341, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 24.444444444444443, |
|
"grad_norm": 1.137839674949646, |
|
"learning_rate": 3.472796934865901e-05, |
|
"loss": 12.1097, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 24.521072796934867, |
|
"grad_norm": 1.9981390237808228, |
|
"learning_rate": 3.4680076628352494e-05, |
|
"loss": 12.4374, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 24.597701149425287, |
|
"grad_norm": 1.6802810430526733, |
|
"learning_rate": 3.463218390804598e-05, |
|
"loss": 12.0851, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 24.674329501915707, |
|
"grad_norm": 2.0081875324249268, |
|
"learning_rate": 3.458429118773947e-05, |
|
"loss": 12.0883, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 24.75095785440613, |
|
"grad_norm": 2.637779474258423, |
|
"learning_rate": 3.453639846743295e-05, |
|
"loss": 12.198, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 24.82758620689655, |
|
"grad_norm": 6.473161220550537, |
|
"learning_rate": 3.4488505747126435e-05, |
|
"loss": 12.1459, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 24.904214559386972, |
|
"grad_norm": 1.3531584739685059, |
|
"learning_rate": 3.444061302681992e-05, |
|
"loss": 12.0297, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 24.980842911877396, |
|
"grad_norm": 1.2492320537567139, |
|
"learning_rate": 3.439272030651341e-05, |
|
"loss": 12.0907, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 12.98237419128418, |
|
"eval_runtime": 44.0055, |
|
"eval_samples_per_second": 29.655, |
|
"eval_steps_per_second": 3.727, |
|
"step": 32625 |
|
}, |
|
{ |
|
"epoch": 25.057471264367816, |
|
"grad_norm": 1.2564047574996948, |
|
"learning_rate": 3.4344827586206896e-05, |
|
"loss": 12.3271, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 25.134099616858236, |
|
"grad_norm": 1.6601101160049438, |
|
"learning_rate": 3.429741379310345e-05, |
|
"loss": 12.2568, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 25.21072796934866, |
|
"grad_norm": 1.8177669048309326, |
|
"learning_rate": 3.4249521072796934e-05, |
|
"loss": 12.2059, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 25.28735632183908, |
|
"grad_norm": 1.5476176738739014, |
|
"learning_rate": 3.420162835249042e-05, |
|
"loss": 12.2871, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 25.3639846743295, |
|
"grad_norm": 1.305198073387146, |
|
"learning_rate": 3.415373563218391e-05, |
|
"loss": 12.258, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 25.440613026819925, |
|
"grad_norm": 5.837198257446289, |
|
"learning_rate": 3.4105842911877395e-05, |
|
"loss": 12.0855, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 25.517241379310345, |
|
"grad_norm": 2.148789882659912, |
|
"learning_rate": 3.405795019157088e-05, |
|
"loss": 12.1539, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 25.593869731800766, |
|
"grad_norm": 1.8985601663589478, |
|
"learning_rate": 3.401005747126437e-05, |
|
"loss": 12.2977, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 25.67049808429119, |
|
"grad_norm": 1.9121934175491333, |
|
"learning_rate": 3.3962164750957855e-05, |
|
"loss": 12.0616, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 25.74712643678161, |
|
"grad_norm": 1.3972700834274292, |
|
"learning_rate": 3.391427203065134e-05, |
|
"loss": 12.0951, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 25.82375478927203, |
|
"grad_norm": 1.3285768032073975, |
|
"learning_rate": 3.386637931034483e-05, |
|
"loss": 12.0531, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 25.900383141762454, |
|
"grad_norm": 2.199030876159668, |
|
"learning_rate": 3.3818486590038316e-05, |
|
"loss": 11.9635, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 25.977011494252874, |
|
"grad_norm": 1.0486905574798584, |
|
"learning_rate": 3.37705938697318e-05, |
|
"loss": 11.9477, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 12.954750061035156, |
|
"eval_runtime": 44.0151, |
|
"eval_samples_per_second": 29.649, |
|
"eval_steps_per_second": 3.726, |
|
"step": 33930 |
|
}, |
|
{ |
|
"epoch": 26.053639846743295, |
|
"grad_norm": 1.8525198698043823, |
|
"learning_rate": 3.372270114942529e-05, |
|
"loss": 11.9857, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 26.130268199233715, |
|
"grad_norm": 1.4454785585403442, |
|
"learning_rate": 3.367480842911878e-05, |
|
"loss": 11.8142, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 26.20689655172414, |
|
"grad_norm": 1.6828280687332153, |
|
"learning_rate": 3.3626915708812264e-05, |
|
"loss": 11.9359, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 26.28352490421456, |
|
"grad_norm": 1.898542046546936, |
|
"learning_rate": 3.357902298850575e-05, |
|
"loss": 12.3808, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 26.36015325670498, |
|
"grad_norm": 1.3259601593017578, |
|
"learning_rate": 3.353113026819924e-05, |
|
"loss": 11.9188, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 26.436781609195403, |
|
"grad_norm": 1.2543106079101562, |
|
"learning_rate": 3.348323754789272e-05, |
|
"loss": 12.2622, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 26.513409961685824, |
|
"grad_norm": 1.1741349697113037, |
|
"learning_rate": 3.3435344827586204e-05, |
|
"loss": 12.3296, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 26.590038314176244, |
|
"grad_norm": 2.937052011489868, |
|
"learning_rate": 3.338745210727969e-05, |
|
"loss": 12.0383, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"grad_norm": 1.5736559629440308, |
|
"learning_rate": 3.333955938697318e-05, |
|
"loss": 12.178, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 26.743295019157088, |
|
"grad_norm": 1.9110735654830933, |
|
"learning_rate": 3.329214559386974e-05, |
|
"loss": 12.223, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 26.81992337164751, |
|
"grad_norm": 0.9110540747642517, |
|
"learning_rate": 3.324425287356322e-05, |
|
"loss": 12.1191, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 26.896551724137932, |
|
"grad_norm": 1.3772426843643188, |
|
"learning_rate": 3.3196360153256704e-05, |
|
"loss": 12.1527, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 26.973180076628353, |
|
"grad_norm": 1.5747685432434082, |
|
"learning_rate": 3.314846743295019e-05, |
|
"loss": 12.093, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 12.915553092956543, |
|
"eval_runtime": 44.0197, |
|
"eval_samples_per_second": 29.646, |
|
"eval_steps_per_second": 3.726, |
|
"step": 35235 |
|
}, |
|
{ |
|
"epoch": 27.049808429118773, |
|
"grad_norm": 1.285940408706665, |
|
"learning_rate": 3.310057471264368e-05, |
|
"loss": 12.1302, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 27.126436781609197, |
|
"grad_norm": 1.3924872875213623, |
|
"learning_rate": 3.3052681992337164e-05, |
|
"loss": 12.2251, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 27.203065134099617, |
|
"grad_norm": 3.2285568714141846, |
|
"learning_rate": 3.300478927203065e-05, |
|
"loss": 12.1551, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 27.279693486590038, |
|
"grad_norm": 1.9970892667770386, |
|
"learning_rate": 3.295689655172414e-05, |
|
"loss": 12.1276, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 27.35632183908046, |
|
"grad_norm": 1.5273020267486572, |
|
"learning_rate": 3.290900383141763e-05, |
|
"loss": 12.3051, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 27.43295019157088, |
|
"grad_norm": 1.3356541395187378, |
|
"learning_rate": 3.286111111111111e-05, |
|
"loss": 12.1591, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 27.509578544061302, |
|
"grad_norm": 1.1603785753250122, |
|
"learning_rate": 3.28132183908046e-05, |
|
"loss": 11.9451, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 27.586206896551722, |
|
"grad_norm": 1.2263092994689941, |
|
"learning_rate": 3.2765325670498086e-05, |
|
"loss": 12.069, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 27.662835249042146, |
|
"grad_norm": 2.639704465866089, |
|
"learning_rate": 3.271743295019157e-05, |
|
"loss": 12.0213, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 27.739463601532567, |
|
"grad_norm": 1.1907585859298706, |
|
"learning_rate": 3.266954022988506e-05, |
|
"loss": 12.0336, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 27.816091954022987, |
|
"grad_norm": 2.5226128101348877, |
|
"learning_rate": 3.2621647509578546e-05, |
|
"loss": 12.1515, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 27.89272030651341, |
|
"grad_norm": 1.263527274131775, |
|
"learning_rate": 3.257375478927203e-05, |
|
"loss": 12.067, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 27.96934865900383, |
|
"grad_norm": 1.636793613433838, |
|
"learning_rate": 3.252586206896552e-05, |
|
"loss": 12.14, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 12.91286563873291, |
|
"eval_runtime": 44.033, |
|
"eval_samples_per_second": 29.637, |
|
"eval_steps_per_second": 3.724, |
|
"step": 36540 |
|
}, |
|
{ |
|
"epoch": 28.04597701149425, |
|
"grad_norm": 1.691573977470398, |
|
"learning_rate": 3.247796934865901e-05, |
|
"loss": 12.0472, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 28.122605363984675, |
|
"grad_norm": 2.2020788192749023, |
|
"learning_rate": 3.2430076628352494e-05, |
|
"loss": 12.0171, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 28.199233716475096, |
|
"grad_norm": 1.9675192832946777, |
|
"learning_rate": 3.2382183908045974e-05, |
|
"loss": 12.1335, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 28.275862068965516, |
|
"grad_norm": 2.210883378982544, |
|
"learning_rate": 3.233429118773946e-05, |
|
"loss": 12.065, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 28.35249042145594, |
|
"grad_norm": 1.4574834108352661, |
|
"learning_rate": 3.2286398467432954e-05, |
|
"loss": 12.0635, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 28.42911877394636, |
|
"grad_norm": 2.1000685691833496, |
|
"learning_rate": 3.223850574712644e-05, |
|
"loss": 12.2908, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 28.50574712643678, |
|
"grad_norm": 2.088956832885742, |
|
"learning_rate": 3.2191091954022986e-05, |
|
"loss": 12.2421, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 28.582375478927204, |
|
"grad_norm": 1.5785751342773438, |
|
"learning_rate": 3.214319923371647e-05, |
|
"loss": 12.0568, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 28.659003831417625, |
|
"grad_norm": 1.5230878591537476, |
|
"learning_rate": 3.209530651340996e-05, |
|
"loss": 12.0995, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 28.735632183908045, |
|
"grad_norm": 1.1175010204315186, |
|
"learning_rate": 3.204741379310345e-05, |
|
"loss": 12.17, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 28.81226053639847, |
|
"grad_norm": 1.6524131298065186, |
|
"learning_rate": 3.1999521072796934e-05, |
|
"loss": 12.1192, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 28.88888888888889, |
|
"grad_norm": 1.5143946409225464, |
|
"learning_rate": 3.195162835249042e-05, |
|
"loss": 11.9995, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 28.96551724137931, |
|
"grad_norm": 1.2787953615188599, |
|
"learning_rate": 3.1903735632183914e-05, |
|
"loss": 12.0876, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 12.9454984664917, |
|
"eval_runtime": 44.0594, |
|
"eval_samples_per_second": 29.619, |
|
"eval_steps_per_second": 3.722, |
|
"step": 37845 |
|
}, |
|
{ |
|
"epoch": 29.042145593869733, |
|
"grad_norm": 1.4434622526168823, |
|
"learning_rate": 3.18558429118774e-05, |
|
"loss": 11.8509, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 29.118773946360154, |
|
"grad_norm": 1.2989375591278076, |
|
"learning_rate": 3.180795019157088e-05, |
|
"loss": 12.1473, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 29.195402298850574, |
|
"grad_norm": 1.6747602224349976, |
|
"learning_rate": 3.176005747126437e-05, |
|
"loss": 12.1781, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 29.272030651340994, |
|
"grad_norm": 2.2328062057495117, |
|
"learning_rate": 3.1712164750957855e-05, |
|
"loss": 12.2881, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 29.34865900383142, |
|
"grad_norm": 2.3226537704467773, |
|
"learning_rate": 3.166427203065134e-05, |
|
"loss": 12.0132, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 29.42528735632184, |
|
"grad_norm": 1.7786709070205688, |
|
"learning_rate": 3.161637931034483e-05, |
|
"loss": 12.2086, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 29.50191570881226, |
|
"grad_norm": 2.359247922897339, |
|
"learning_rate": 3.1568486590038316e-05, |
|
"loss": 12.3037, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 29.578544061302683, |
|
"grad_norm": 1.661720633506775, |
|
"learning_rate": 3.15205938697318e-05, |
|
"loss": 11.9945, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 29.655172413793103, |
|
"grad_norm": 1.2464226484298706, |
|
"learning_rate": 3.147270114942529e-05, |
|
"loss": 12.0475, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 29.731800766283524, |
|
"grad_norm": 5.234483242034912, |
|
"learning_rate": 3.1424808429118776e-05, |
|
"loss": 12.1442, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 29.808429118773947, |
|
"grad_norm": 1.2800259590148926, |
|
"learning_rate": 3.137691570881226e-05, |
|
"loss": 11.923, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 29.885057471264368, |
|
"grad_norm": 1.3353965282440186, |
|
"learning_rate": 3.132902298850574e-05, |
|
"loss": 12.0991, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 29.961685823754788, |
|
"grad_norm": 1.974084734916687, |
|
"learning_rate": 3.128113026819924e-05, |
|
"loss": 12.0987, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 12.926346778869629, |
|
"eval_runtime": 44.1327, |
|
"eval_samples_per_second": 29.57, |
|
"eval_steps_per_second": 3.716, |
|
"step": 39150 |
|
}, |
|
{ |
|
"epoch": 30.038314176245212, |
|
"grad_norm": 2.184515953063965, |
|
"learning_rate": 3.1233237547892724e-05, |
|
"loss": 11.9969, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 30.114942528735632, |
|
"grad_norm": 3.448138952255249, |
|
"learning_rate": 3.1185823754789276e-05, |
|
"loss": 12.2465, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 30.191570881226053, |
|
"grad_norm": 1.5382182598114014, |
|
"learning_rate": 3.113793103448276e-05, |
|
"loss": 12.1218, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 30.268199233716476, |
|
"grad_norm": 1.4232020378112793, |
|
"learning_rate": 3.109003831417624e-05, |
|
"loss": 12.0744, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 30.344827586206897, |
|
"grad_norm": 1.130115270614624, |
|
"learning_rate": 3.104214559386973e-05, |
|
"loss": 11.982, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 30.421455938697317, |
|
"grad_norm": 0.9410238265991211, |
|
"learning_rate": 3.0994252873563216e-05, |
|
"loss": 11.9721, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 30.49808429118774, |
|
"grad_norm": 1.6789051294326782, |
|
"learning_rate": 3.09463601532567e-05, |
|
"loss": 12.2021, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 30.57471264367816, |
|
"grad_norm": 1.7361513376235962, |
|
"learning_rate": 3.08984674329502e-05, |
|
"loss": 12.1236, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 30.65134099616858, |
|
"grad_norm": 1.868490219116211, |
|
"learning_rate": 3.0850574712643684e-05, |
|
"loss": 12.0632, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 30.727969348659006, |
|
"grad_norm": 1.3586502075195312, |
|
"learning_rate": 3.080268199233717e-05, |
|
"loss": 12.0715, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 30.804597701149426, |
|
"grad_norm": 1.6496648788452148, |
|
"learning_rate": 3.075478927203066e-05, |
|
"loss": 12.0989, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 30.881226053639846, |
|
"grad_norm": 1.8671578168869019, |
|
"learning_rate": 3.070689655172414e-05, |
|
"loss": 11.996, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 30.957854406130267, |
|
"grad_norm": 0.9875293374061584, |
|
"learning_rate": 3.0659003831417624e-05, |
|
"loss": 12.0908, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_loss": 12.88086986541748, |
|
"eval_runtime": 44.1375, |
|
"eval_samples_per_second": 29.567, |
|
"eval_steps_per_second": 3.716, |
|
"step": 40455 |
|
}, |
|
{ |
|
"epoch": 31.03448275862069, |
|
"grad_norm": 4.194854259490967, |
|
"learning_rate": 3.061111111111111e-05, |
|
"loss": 12.0422, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 31.11111111111111, |
|
"grad_norm": 1.550528883934021, |
|
"learning_rate": 3.05632183908046e-05, |
|
"loss": 12.2051, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 31.18773946360153, |
|
"grad_norm": 2.011462450027466, |
|
"learning_rate": 3.0515325670498085e-05, |
|
"loss": 12.1084, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 31.264367816091955, |
|
"grad_norm": 1.100541114807129, |
|
"learning_rate": 3.0467432950191572e-05, |
|
"loss": 11.9174, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 31.340996168582375, |
|
"grad_norm": 1.1993151903152466, |
|
"learning_rate": 3.041954022988506e-05, |
|
"loss": 12.0801, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 31.417624521072796, |
|
"grad_norm": 1.501018762588501, |
|
"learning_rate": 3.0371647509578542e-05, |
|
"loss": 12.1011, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 31.49425287356322, |
|
"grad_norm": 1.788327932357788, |
|
"learning_rate": 3.032375478927203e-05, |
|
"loss": 12.192, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 31.57088122605364, |
|
"grad_norm": 1.7562750577926636, |
|
"learning_rate": 3.0275862068965523e-05, |
|
"loss": 11.829, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 31.64750957854406, |
|
"grad_norm": 1.467976450920105, |
|
"learning_rate": 3.0227969348659006e-05, |
|
"loss": 12.0685, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 31.724137931034484, |
|
"grad_norm": 2.4010770320892334, |
|
"learning_rate": 3.0180076628352493e-05, |
|
"loss": 12.0806, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 31.800766283524904, |
|
"grad_norm": 1.759490728378296, |
|
"learning_rate": 3.013218390804598e-05, |
|
"loss": 12.1422, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 31.877394636015325, |
|
"grad_norm": 1.6164530515670776, |
|
"learning_rate": 3.0084291187739467e-05, |
|
"loss": 12.0766, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 31.95402298850575, |
|
"grad_norm": 1.3001078367233276, |
|
"learning_rate": 3.0036398467432954e-05, |
|
"loss": 12.0244, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 12.876104354858398, |
|
"eval_runtime": 44.1527, |
|
"eval_samples_per_second": 29.557, |
|
"eval_steps_per_second": 3.714, |
|
"step": 41760 |
|
}, |
|
{ |
|
"epoch": 32.030651340996165, |
|
"grad_norm": 1.1984444856643677, |
|
"learning_rate": 2.9988505747126437e-05, |
|
"loss": 12.1453, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 32.10727969348659, |
|
"grad_norm": 0.9655357599258423, |
|
"learning_rate": 2.9941091954022986e-05, |
|
"loss": 11.8735, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 32.18390804597701, |
|
"grad_norm": 1.0667262077331543, |
|
"learning_rate": 2.989319923371648e-05, |
|
"loss": 12.1566, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 32.26053639846743, |
|
"grad_norm": 1.6131408214569092, |
|
"learning_rate": 2.9845306513409966e-05, |
|
"loss": 11.9729, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 32.337164750957854, |
|
"grad_norm": 1.6158314943313599, |
|
"learning_rate": 2.979741379310345e-05, |
|
"loss": 12.0362, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 32.41379310344828, |
|
"grad_norm": 1.189818263053894, |
|
"learning_rate": 2.9749521072796937e-05, |
|
"loss": 12.2135, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 32.490421455938694, |
|
"grad_norm": 2.628614664077759, |
|
"learning_rate": 2.9701628352490423e-05, |
|
"loss": 12.032, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 32.56704980842912, |
|
"grad_norm": 1.6809107065200806, |
|
"learning_rate": 2.965373563218391e-05, |
|
"loss": 11.81, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 32.64367816091954, |
|
"grad_norm": 1.6311430931091309, |
|
"learning_rate": 2.9605842911877397e-05, |
|
"loss": 11.9348, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 32.72030651340996, |
|
"grad_norm": 1.2387199401855469, |
|
"learning_rate": 2.955795019157088e-05, |
|
"loss": 12.0694, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 32.79693486590038, |
|
"grad_norm": 1.7171186208724976, |
|
"learning_rate": 2.9510057471264368e-05, |
|
"loss": 11.9729, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 32.87356321839081, |
|
"grad_norm": 1.6134984493255615, |
|
"learning_rate": 2.9462164750957854e-05, |
|
"loss": 12.1292, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 32.95019157088122, |
|
"grad_norm": 2.2401788234710693, |
|
"learning_rate": 2.941427203065134e-05, |
|
"loss": 12.1613, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_loss": 12.873848915100098, |
|
"eval_runtime": 44.126, |
|
"eval_samples_per_second": 29.574, |
|
"eval_steps_per_second": 3.717, |
|
"step": 43065 |
|
}, |
|
{ |
|
"epoch": 33.02681992337165, |
|
"grad_norm": 1.260538935661316, |
|
"learning_rate": 2.9366379310344828e-05, |
|
"loss": 12.1855, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 33.10344827586207, |
|
"grad_norm": 1.7840496301651, |
|
"learning_rate": 2.9318486590038312e-05, |
|
"loss": 12.0618, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 33.18007662835249, |
|
"grad_norm": 1.162712574005127, |
|
"learning_rate": 2.92705938697318e-05, |
|
"loss": 12.2513, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 33.25670498084291, |
|
"grad_norm": 3.618567705154419, |
|
"learning_rate": 2.9222701149425292e-05, |
|
"loss": 12.0614, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"grad_norm": 1.2605602741241455, |
|
"learning_rate": 2.9174808429118776e-05, |
|
"loss": 11.9763, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 33.40996168582375, |
|
"grad_norm": 1.4304360151290894, |
|
"learning_rate": 2.9126915708812263e-05, |
|
"loss": 12.1044, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 33.486590038314176, |
|
"grad_norm": 1.1767237186431885, |
|
"learning_rate": 2.907902298850575e-05, |
|
"loss": 11.8996, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 33.5632183908046, |
|
"grad_norm": 1.6173638105392456, |
|
"learning_rate": 2.9031130268199236e-05, |
|
"loss": 11.969, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 33.63984674329502, |
|
"grad_norm": 1.2231945991516113, |
|
"learning_rate": 2.8983237547892723e-05, |
|
"loss": 12.2301, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 33.71647509578544, |
|
"grad_norm": 3.853048801422119, |
|
"learning_rate": 2.8935344827586207e-05, |
|
"loss": 11.9726, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 33.793103448275865, |
|
"grad_norm": 1.4259275197982788, |
|
"learning_rate": 2.8887452107279694e-05, |
|
"loss": 11.9545, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 33.86973180076628, |
|
"grad_norm": 2.5803606510162354, |
|
"learning_rate": 2.883955938697318e-05, |
|
"loss": 11.8867, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 33.946360153256705, |
|
"grad_norm": 1.3688091039657593, |
|
"learning_rate": 2.8791666666666667e-05, |
|
"loss": 12.0033, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_loss": 12.871088027954102, |
|
"eval_runtime": 44.1202, |
|
"eval_samples_per_second": 29.578, |
|
"eval_steps_per_second": 3.717, |
|
"step": 44370 |
|
}, |
|
{ |
|
"epoch": 34.02298850574713, |
|
"grad_norm": 1.947970986366272, |
|
"learning_rate": 2.8743773946360154e-05, |
|
"loss": 11.9572, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 34.099616858237546, |
|
"grad_norm": 1.9568095207214355, |
|
"learning_rate": 2.8696360153256706e-05, |
|
"loss": 12.0624, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 34.17624521072797, |
|
"grad_norm": 1.4037648439407349, |
|
"learning_rate": 2.8648467432950193e-05, |
|
"loss": 11.8426, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 34.252873563218394, |
|
"grad_norm": 2.5989620685577393, |
|
"learning_rate": 2.860057471264368e-05, |
|
"loss": 11.9217, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 34.32950191570881, |
|
"grad_norm": 1.3627197742462158, |
|
"learning_rate": 2.8552681992337167e-05, |
|
"loss": 11.9418, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 34.406130268199234, |
|
"grad_norm": 1.4087576866149902, |
|
"learning_rate": 2.8504789272030654e-05, |
|
"loss": 12.1608, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 34.48275862068966, |
|
"grad_norm": 1.4856873750686646, |
|
"learning_rate": 2.8456896551724137e-05, |
|
"loss": 11.9778, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 34.559386973180075, |
|
"grad_norm": 1.631663203239441, |
|
"learning_rate": 2.8409003831417624e-05, |
|
"loss": 12.0547, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 34.6360153256705, |
|
"grad_norm": 2.1117138862609863, |
|
"learning_rate": 2.836111111111111e-05, |
|
"loss": 12.0824, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 34.71264367816092, |
|
"grad_norm": 1.9915541410446167, |
|
"learning_rate": 2.8313218390804598e-05, |
|
"loss": 12.0984, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 34.78927203065134, |
|
"grad_norm": 2.4851934909820557, |
|
"learning_rate": 2.8265325670498085e-05, |
|
"loss": 12.0646, |
|
"step": 45400 |
|
}, |
|
{ |
|
"epoch": 34.86590038314176, |
|
"grad_norm": 1.1414411067962646, |
|
"learning_rate": 2.8217432950191575e-05, |
|
"loss": 12.0986, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 34.94252873563218, |
|
"grad_norm": 1.0578815937042236, |
|
"learning_rate": 2.8169540229885062e-05, |
|
"loss": 12.1035, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_loss": 12.84704875946045, |
|
"eval_runtime": 44.1331, |
|
"eval_samples_per_second": 29.57, |
|
"eval_steps_per_second": 3.716, |
|
"step": 45675 |
|
}, |
|
{ |
|
"epoch": 35.019157088122604, |
|
"grad_norm": 1.2231003046035767, |
|
"learning_rate": 2.812164750957855e-05, |
|
"loss": 12.2043, |
|
"step": 45700 |
|
}, |
|
{ |
|
"epoch": 35.09578544061303, |
|
"grad_norm": 1.6044613122940063, |
|
"learning_rate": 2.8073754789272032e-05, |
|
"loss": 11.9987, |
|
"step": 45800 |
|
}, |
|
{ |
|
"epoch": 35.172413793103445, |
|
"grad_norm": 1.208008050918579, |
|
"learning_rate": 2.802586206896552e-05, |
|
"loss": 11.7725, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 35.24904214559387, |
|
"grad_norm": 1.8152436017990112, |
|
"learning_rate": 2.7977969348659006e-05, |
|
"loss": 11.9232, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 35.32567049808429, |
|
"grad_norm": 0.9535597562789917, |
|
"learning_rate": 2.7930076628352493e-05, |
|
"loss": 12.2091, |
|
"step": 46100 |
|
}, |
|
{ |
|
"epoch": 35.40229885057471, |
|
"grad_norm": 1.5778999328613281, |
|
"learning_rate": 2.7882183908045976e-05, |
|
"loss": 12.0968, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 35.47892720306513, |
|
"grad_norm": 1.5384963750839233, |
|
"learning_rate": 2.7834291187739463e-05, |
|
"loss": 12.1058, |
|
"step": 46300 |
|
}, |
|
{ |
|
"epoch": 35.55555555555556, |
|
"grad_norm": 1.1971815824508667, |
|
"learning_rate": 2.778639846743295e-05, |
|
"loss": 12.048, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 35.632183908045974, |
|
"grad_norm": 1.2047299146652222, |
|
"learning_rate": 2.7738505747126437e-05, |
|
"loss": 12.0413, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 35.7088122605364, |
|
"grad_norm": 1.6629399061203003, |
|
"learning_rate": 2.7690613026819924e-05, |
|
"loss": 11.9562, |
|
"step": 46600 |
|
}, |
|
{ |
|
"epoch": 35.78544061302682, |
|
"grad_norm": 1.8731905221939087, |
|
"learning_rate": 2.7642720306513407e-05, |
|
"loss": 12.0334, |
|
"step": 46700 |
|
}, |
|
{ |
|
"epoch": 35.86206896551724, |
|
"grad_norm": 1.5753523111343384, |
|
"learning_rate": 2.75948275862069e-05, |
|
"loss": 11.9348, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 35.93869731800766, |
|
"grad_norm": 2.0848851203918457, |
|
"learning_rate": 2.7546934865900388e-05, |
|
"loss": 12.0199, |
|
"step": 46900 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_loss": 12.837443351745605, |
|
"eval_runtime": 44.1529, |
|
"eval_samples_per_second": 29.556, |
|
"eval_steps_per_second": 3.714, |
|
"step": 46980 |
|
}, |
|
{ |
|
"epoch": 36.015325670498086, |
|
"grad_norm": 1.3191312551498413, |
|
"learning_rate": 2.749904214559387e-05, |
|
"loss": 12.1034, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 36.0919540229885, |
|
"grad_norm": 1.8107291460037231, |
|
"learning_rate": 2.7451149425287358e-05, |
|
"loss": 11.9679, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 36.16858237547893, |
|
"grad_norm": 2.29463529586792, |
|
"learning_rate": 2.7403735632183906e-05, |
|
"loss": 11.7111, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 36.24521072796935, |
|
"grad_norm": 1.3297805786132812, |
|
"learning_rate": 2.7355842911877393e-05, |
|
"loss": 11.8913, |
|
"step": 47300 |
|
}, |
|
{ |
|
"epoch": 36.32183908045977, |
|
"grad_norm": 1.1663862466812134, |
|
"learning_rate": 2.730795019157088e-05, |
|
"loss": 12.0487, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 36.39846743295019, |
|
"grad_norm": 1.4846138954162598, |
|
"learning_rate": 2.7260057471264367e-05, |
|
"loss": 12.1661, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 36.475095785440615, |
|
"grad_norm": 1.8800255060195923, |
|
"learning_rate": 2.7212164750957857e-05, |
|
"loss": 11.9248, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 36.55172413793103, |
|
"grad_norm": 1.7427587509155273, |
|
"learning_rate": 2.7164272030651344e-05, |
|
"loss": 12.0681, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 36.628352490421456, |
|
"grad_norm": 2.0017685890197754, |
|
"learning_rate": 2.711637931034483e-05, |
|
"loss": 12.2556, |
|
"step": 47800 |
|
}, |
|
{ |
|
"epoch": 36.70498084291188, |
|
"grad_norm": 2.765782117843628, |
|
"learning_rate": 2.7068486590038318e-05, |
|
"loss": 11.8846, |
|
"step": 47900 |
|
}, |
|
{ |
|
"epoch": 36.7816091954023, |
|
"grad_norm": 1.519728422164917, |
|
"learning_rate": 2.70205938697318e-05, |
|
"loss": 12.0119, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 36.85823754789272, |
|
"grad_norm": 1.091073989868164, |
|
"learning_rate": 2.697270114942529e-05, |
|
"loss": 12.1197, |
|
"step": 48100 |
|
}, |
|
{ |
|
"epoch": 36.934865900383144, |
|
"grad_norm": 1.3182342052459717, |
|
"learning_rate": 2.6924808429118775e-05, |
|
"loss": 12.0217, |
|
"step": 48200 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_loss": 12.849996566772461, |
|
"eval_runtime": 44.1316, |
|
"eval_samples_per_second": 29.571, |
|
"eval_steps_per_second": 3.716, |
|
"step": 48285 |
|
}, |
|
{ |
|
"epoch": 37.01149425287356, |
|
"grad_norm": 1.9082536697387695, |
|
"learning_rate": 2.6876915708812262e-05, |
|
"loss": 12.2391, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 37.088122605363985, |
|
"grad_norm": 1.5705393552780151, |
|
"learning_rate": 2.682902298850575e-05, |
|
"loss": 12.1329, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 37.16475095785441, |
|
"grad_norm": 2.2240869998931885, |
|
"learning_rate": 2.6781130268199233e-05, |
|
"loss": 12.108, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 37.241379310344826, |
|
"grad_norm": 1.357383370399475, |
|
"learning_rate": 2.673323754789272e-05, |
|
"loss": 11.9599, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 37.31800766283525, |
|
"grad_norm": 2.1634521484375, |
|
"learning_rate": 2.6685344827586206e-05, |
|
"loss": 12.0339, |
|
"step": 48700 |
|
}, |
|
{ |
|
"epoch": 37.39463601532567, |
|
"grad_norm": 1.611195683479309, |
|
"learning_rate": 2.6637452107279693e-05, |
|
"loss": 12.0276, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 37.47126436781609, |
|
"grad_norm": 1.3676810264587402, |
|
"learning_rate": 2.6589559386973183e-05, |
|
"loss": 11.9487, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 37.547892720306514, |
|
"grad_norm": 1.4503991603851318, |
|
"learning_rate": 2.654166666666667e-05, |
|
"loss": 11.9166, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 37.62452107279694, |
|
"grad_norm": 2.0941789150238037, |
|
"learning_rate": 2.6493773946360157e-05, |
|
"loss": 12.0909, |
|
"step": 49100 |
|
}, |
|
{ |
|
"epoch": 37.701149425287355, |
|
"grad_norm": 1.4591392278671265, |
|
"learning_rate": 2.6445881226053644e-05, |
|
"loss": 11.9453, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 37.77777777777778, |
|
"grad_norm": 1.3402618169784546, |
|
"learning_rate": 2.6397988505747128e-05, |
|
"loss": 11.9431, |
|
"step": 49300 |
|
}, |
|
{ |
|
"epoch": 37.8544061302682, |
|
"grad_norm": 1.697449803352356, |
|
"learning_rate": 2.6350095785440614e-05, |
|
"loss": 11.8129, |
|
"step": 49400 |
|
}, |
|
{ |
|
"epoch": 37.93103448275862, |
|
"grad_norm": 1.5764317512512207, |
|
"learning_rate": 2.63022030651341e-05, |
|
"loss": 11.975, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_loss": 12.832439422607422, |
|
"eval_runtime": 44.0844, |
|
"eval_samples_per_second": 29.602, |
|
"eval_steps_per_second": 3.72, |
|
"step": 49590 |
|
}, |
|
{ |
|
"epoch": 38.00766283524904, |
|
"grad_norm": 3.7600104808807373, |
|
"learning_rate": 2.6254310344827588e-05, |
|
"loss": 12.1701, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 38.08429118773947, |
|
"grad_norm": 1.9188120365142822, |
|
"learning_rate": 2.6206417624521075e-05, |
|
"loss": 12.0672, |
|
"step": 49700 |
|
}, |
|
{ |
|
"epoch": 38.160919540229884, |
|
"grad_norm": 1.5679752826690674, |
|
"learning_rate": 2.615852490421456e-05, |
|
"loss": 11.9374, |
|
"step": 49800 |
|
}, |
|
{ |
|
"epoch": 38.23754789272031, |
|
"grad_norm": 1.6603142023086548, |
|
"learning_rate": 2.6110632183908045e-05, |
|
"loss": 11.8708, |
|
"step": 49900 |
|
}, |
|
{ |
|
"epoch": 38.31417624521073, |
|
"grad_norm": 2.0302236080169678, |
|
"learning_rate": 2.6062739463601532e-05, |
|
"loss": 12.0997, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 38.39080459770115, |
|
"grad_norm": 1.4646397829055786, |
|
"learning_rate": 2.601484674329502e-05, |
|
"loss": 12.1337, |
|
"step": 50100 |
|
}, |
|
{ |
|
"epoch": 38.46743295019157, |
|
"grad_norm": 2.1434216499328613, |
|
"learning_rate": 2.5966954022988506e-05, |
|
"loss": 12.063, |
|
"step": 50200 |
|
}, |
|
{ |
|
"epoch": 38.54406130268199, |
|
"grad_norm": 1.4451220035552979, |
|
"learning_rate": 2.5919061302681996e-05, |
|
"loss": 11.8743, |
|
"step": 50300 |
|
}, |
|
{ |
|
"epoch": 38.62068965517241, |
|
"grad_norm": 1.4875038862228394, |
|
"learning_rate": 2.5871168582375483e-05, |
|
"loss": 12.1545, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 38.69731800766284, |
|
"grad_norm": 2.4424338340759277, |
|
"learning_rate": 2.582327586206897e-05, |
|
"loss": 11.9573, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 38.77394636015325, |
|
"grad_norm": 1.0890432596206665, |
|
"learning_rate": 2.5775383141762454e-05, |
|
"loss": 11.894, |
|
"step": 50600 |
|
}, |
|
{ |
|
"epoch": 38.85057471264368, |
|
"grad_norm": 1.410107970237732, |
|
"learning_rate": 2.572749042145594e-05, |
|
"loss": 12.0408, |
|
"step": 50700 |
|
}, |
|
{ |
|
"epoch": 38.9272030651341, |
|
"grad_norm": 1.1632236242294312, |
|
"learning_rate": 2.5679597701149427e-05, |
|
"loss": 12.0218, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_loss": 12.819197654724121, |
|
"eval_runtime": 44.0917, |
|
"eval_samples_per_second": 29.597, |
|
"eval_steps_per_second": 3.72, |
|
"step": 50895 |
|
}, |
|
{ |
|
"epoch": 39.00383141762452, |
|
"grad_norm": 1.8346548080444336, |
|
"learning_rate": 2.5631704980842914e-05, |
|
"loss": 11.9914, |
|
"step": 50900 |
|
}, |
|
{ |
|
"epoch": 39.08045977011494, |
|
"grad_norm": 1.3156729936599731, |
|
"learning_rate": 2.55838122605364e-05, |
|
"loss": 11.882, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 39.157088122605366, |
|
"grad_norm": 1.464136004447937, |
|
"learning_rate": 2.5535919540229885e-05, |
|
"loss": 12.0324, |
|
"step": 51100 |
|
}, |
|
{ |
|
"epoch": 39.23371647509578, |
|
"grad_norm": 1.40706205368042, |
|
"learning_rate": 2.548802681992337e-05, |
|
"loss": 12.0355, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 39.310344827586206, |
|
"grad_norm": 1.1469753980636597, |
|
"learning_rate": 2.544013409961686e-05, |
|
"loss": 11.8437, |
|
"step": 51300 |
|
}, |
|
{ |
|
"epoch": 39.38697318007663, |
|
"grad_norm": 2.110839605331421, |
|
"learning_rate": 2.5392241379310345e-05, |
|
"loss": 12.0156, |
|
"step": 51400 |
|
}, |
|
{ |
|
"epoch": 39.46360153256705, |
|
"grad_norm": 1.0058891773223877, |
|
"learning_rate": 2.534434865900383e-05, |
|
"loss": 12.093, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 39.54022988505747, |
|
"grad_norm": 1.7903035879135132, |
|
"learning_rate": 2.5296455938697316e-05, |
|
"loss": 12.1111, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 39.616858237547895, |
|
"grad_norm": 1.7223442792892456, |
|
"learning_rate": 2.524856321839081e-05, |
|
"loss": 11.8909, |
|
"step": 51700 |
|
}, |
|
{ |
|
"epoch": 39.69348659003831, |
|
"grad_norm": 1.6216609477996826, |
|
"learning_rate": 2.5200670498084293e-05, |
|
"loss": 12.0638, |
|
"step": 51800 |
|
}, |
|
{ |
|
"epoch": 39.770114942528735, |
|
"grad_norm": 2.2488083839416504, |
|
"learning_rate": 2.515277777777778e-05, |
|
"loss": 12.193, |
|
"step": 51900 |
|
}, |
|
{ |
|
"epoch": 39.84674329501916, |
|
"grad_norm": 1.9876821041107178, |
|
"learning_rate": 2.5104885057471267e-05, |
|
"loss": 11.9594, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 39.923371647509576, |
|
"grad_norm": 2.0479111671447754, |
|
"learning_rate": 2.5056992337164753e-05, |
|
"loss": 11.8695, |
|
"step": 52100 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 2.512753486633301, |
|
"learning_rate": 2.500909961685824e-05, |
|
"loss": 11.9546, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 12.806585311889648, |
|
"eval_runtime": 44.0741, |
|
"eval_samples_per_second": 29.609, |
|
"eval_steps_per_second": 3.721, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 40.076628352490424, |
|
"grad_norm": 1.4184033870697021, |
|
"learning_rate": 2.4961206896551724e-05, |
|
"loss": 11.9875, |
|
"step": 52300 |
|
}, |
|
{ |
|
"epoch": 40.15325670498084, |
|
"grad_norm": 2.1215152740478516, |
|
"learning_rate": 2.491331417624521e-05, |
|
"loss": 11.8898, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 40.229885057471265, |
|
"grad_norm": 1.5458124876022339, |
|
"learning_rate": 2.4865421455938698e-05, |
|
"loss": 12.2281, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 40.30651340996169, |
|
"grad_norm": 1.336580753326416, |
|
"learning_rate": 2.4817528735632184e-05, |
|
"loss": 11.743, |
|
"step": 52600 |
|
}, |
|
{ |
|
"epoch": 40.383141762452105, |
|
"grad_norm": 1.1983288526535034, |
|
"learning_rate": 2.476963601532567e-05, |
|
"loss": 12.0526, |
|
"step": 52700 |
|
}, |
|
{ |
|
"epoch": 40.45977011494253, |
|
"grad_norm": 3.6479368209838867, |
|
"learning_rate": 2.4721743295019158e-05, |
|
"loss": 11.9597, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 40.53639846743295, |
|
"grad_norm": 2.154127359390259, |
|
"learning_rate": 2.467432950191571e-05, |
|
"loss": 11.9651, |
|
"step": 52900 |
|
}, |
|
{ |
|
"epoch": 40.61302681992337, |
|
"grad_norm": 1.476364016532898, |
|
"learning_rate": 2.4626436781609197e-05, |
|
"loss": 11.8092, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 40.689655172413794, |
|
"grad_norm": 1.9797921180725098, |
|
"learning_rate": 2.4578544061302684e-05, |
|
"loss": 12.1406, |
|
"step": 53100 |
|
}, |
|
{ |
|
"epoch": 40.76628352490422, |
|
"grad_norm": 1.5220038890838623, |
|
"learning_rate": 2.453065134099617e-05, |
|
"loss": 11.8779, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 40.842911877394634, |
|
"grad_norm": 1.1830068826675415, |
|
"learning_rate": 2.4482758620689654e-05, |
|
"loss": 12.0007, |
|
"step": 53300 |
|
}, |
|
{ |
|
"epoch": 40.91954022988506, |
|
"grad_norm": 1.3260859251022339, |
|
"learning_rate": 2.4434865900383144e-05, |
|
"loss": 12.1607, |
|
"step": 53400 |
|
}, |
|
{ |
|
"epoch": 40.99616858237548, |
|
"grad_norm": 1.8781402111053467, |
|
"learning_rate": 2.438697318007663e-05, |
|
"loss": 11.9159, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_loss": 12.82541275024414, |
|
"eval_runtime": 44.0679, |
|
"eval_samples_per_second": 29.613, |
|
"eval_steps_per_second": 3.722, |
|
"step": 53505 |
|
}, |
|
{ |
|
"epoch": 41.0727969348659, |
|
"grad_norm": 3.089315891265869, |
|
"learning_rate": 2.4339080459770118e-05, |
|
"loss": 12.0552, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 41.14942528735632, |
|
"grad_norm": 1.9572243690490723, |
|
"learning_rate": 2.42911877394636e-05, |
|
"loss": 12.0124, |
|
"step": 53700 |
|
}, |
|
{ |
|
"epoch": 41.22605363984675, |
|
"grad_norm": 1.6215753555297852, |
|
"learning_rate": 2.424329501915709e-05, |
|
"loss": 11.9782, |
|
"step": 53800 |
|
}, |
|
{ |
|
"epoch": 41.30268199233716, |
|
"grad_norm": 1.3075189590454102, |
|
"learning_rate": 2.4195402298850575e-05, |
|
"loss": 12.2317, |
|
"step": 53900 |
|
}, |
|
{ |
|
"epoch": 41.37931034482759, |
|
"grad_norm": 1.1214234828948975, |
|
"learning_rate": 2.4147509578544062e-05, |
|
"loss": 12.1511, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 41.45593869731801, |
|
"grad_norm": 8.386270523071289, |
|
"learning_rate": 2.409961685823755e-05, |
|
"loss": 11.8253, |
|
"step": 54100 |
|
}, |
|
{ |
|
"epoch": 41.53256704980843, |
|
"grad_norm": 5.074198246002197, |
|
"learning_rate": 2.4051724137931036e-05, |
|
"loss": 12.0205, |
|
"step": 54200 |
|
}, |
|
{ |
|
"epoch": 41.60919540229885, |
|
"grad_norm": 1.2190698385238647, |
|
"learning_rate": 2.4003831417624523e-05, |
|
"loss": 11.9438, |
|
"step": 54300 |
|
}, |
|
{ |
|
"epoch": 41.68582375478927, |
|
"grad_norm": 1.3544102907180786, |
|
"learning_rate": 2.395593869731801e-05, |
|
"loss": 12.1235, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 41.76245210727969, |
|
"grad_norm": 1.080891489982605, |
|
"learning_rate": 2.3908045977011497e-05, |
|
"loss": 11.7676, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 41.839080459770116, |
|
"grad_norm": 1.453224539756775, |
|
"learning_rate": 2.386015325670498e-05, |
|
"loss": 12.0158, |
|
"step": 54600 |
|
}, |
|
{ |
|
"epoch": 41.91570881226053, |
|
"grad_norm": 1.3428503274917603, |
|
"learning_rate": 2.3812260536398467e-05, |
|
"loss": 11.8066, |
|
"step": 54700 |
|
}, |
|
{ |
|
"epoch": 41.99233716475096, |
|
"grad_norm": 1.3496088981628418, |
|
"learning_rate": 2.3764367816091957e-05, |
|
"loss": 11.8988, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_loss": 12.804805755615234, |
|
"eval_runtime": 44.1053, |
|
"eval_samples_per_second": 29.588, |
|
"eval_steps_per_second": 3.718, |
|
"step": 54810 |
|
}, |
|
{ |
|
"epoch": 42.06896551724138, |
|
"grad_norm": 1.2151437997817993, |
|
"learning_rate": 2.3716475095785444e-05, |
|
"loss": 12.1893, |
|
"step": 54900 |
|
}, |
|
{ |
|
"epoch": 42.1455938697318, |
|
"grad_norm": 1.6184425354003906, |
|
"learning_rate": 2.3669061302681993e-05, |
|
"loss": 12.0546, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 42.22222222222222, |
|
"grad_norm": 1.6667332649230957, |
|
"learning_rate": 2.362116858237548e-05, |
|
"loss": 11.7933, |
|
"step": 55100 |
|
}, |
|
{ |
|
"epoch": 42.298850574712645, |
|
"grad_norm": 3.835425615310669, |
|
"learning_rate": 2.3573275862068966e-05, |
|
"loss": 11.9275, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 42.37547892720306, |
|
"grad_norm": 4.450900554656982, |
|
"learning_rate": 2.3525383141762453e-05, |
|
"loss": 12.1853, |
|
"step": 55300 |
|
}, |
|
{ |
|
"epoch": 42.452107279693486, |
|
"grad_norm": 1.4358230829238892, |
|
"learning_rate": 2.347749042145594e-05, |
|
"loss": 12.0228, |
|
"step": 55400 |
|
}, |
|
{ |
|
"epoch": 42.52873563218391, |
|
"grad_norm": 1.6793595552444458, |
|
"learning_rate": 2.3429597701149427e-05, |
|
"loss": 11.9595, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 42.60536398467433, |
|
"grad_norm": 1.305600643157959, |
|
"learning_rate": 2.3381704980842914e-05, |
|
"loss": 11.8126, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 42.68199233716475, |
|
"grad_norm": 1.5794193744659424, |
|
"learning_rate": 2.33338122605364e-05, |
|
"loss": 12.0154, |
|
"step": 55700 |
|
}, |
|
{ |
|
"epoch": 42.758620689655174, |
|
"grad_norm": 1.6401104927062988, |
|
"learning_rate": 2.3285919540229888e-05, |
|
"loss": 11.8344, |
|
"step": 55800 |
|
}, |
|
{ |
|
"epoch": 42.83524904214559, |
|
"grad_norm": 1.6348859071731567, |
|
"learning_rate": 2.323802681992337e-05, |
|
"loss": 12.0174, |
|
"step": 55900 |
|
}, |
|
{ |
|
"epoch": 42.911877394636015, |
|
"grad_norm": 2.6531448364257812, |
|
"learning_rate": 2.3190134099616858e-05, |
|
"loss": 11.8581, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 42.98850574712644, |
|
"grad_norm": 1.423274040222168, |
|
"learning_rate": 2.3142241379310345e-05, |
|
"loss": 11.9313, |
|
"step": 56100 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_loss": 12.791069030761719, |
|
"eval_runtime": 44.1222, |
|
"eval_samples_per_second": 29.577, |
|
"eval_steps_per_second": 3.717, |
|
"step": 56115 |
|
}, |
|
{ |
|
"epoch": 43.065134099616856, |
|
"grad_norm": 1.3258931636810303, |
|
"learning_rate": 2.3094348659003835e-05, |
|
"loss": 11.8864, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 43.14176245210728, |
|
"grad_norm": 1.4615380764007568, |
|
"learning_rate": 2.304645593869732e-05, |
|
"loss": 12.0657, |
|
"step": 56300 |
|
}, |
|
{ |
|
"epoch": 43.2183908045977, |
|
"grad_norm": 1.4611597061157227, |
|
"learning_rate": 2.2998563218390805e-05, |
|
"loss": 11.9148, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 43.29501915708812, |
|
"grad_norm": 1.7766637802124023, |
|
"learning_rate": 2.2950670498084292e-05, |
|
"loss": 12.0493, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 43.371647509578544, |
|
"grad_norm": 1.8123854398727417, |
|
"learning_rate": 2.290277777777778e-05, |
|
"loss": 11.8749, |
|
"step": 56600 |
|
}, |
|
{ |
|
"epoch": 43.44827586206897, |
|
"grad_norm": 2.2500967979431152, |
|
"learning_rate": 2.2854885057471266e-05, |
|
"loss": 12.0237, |
|
"step": 56700 |
|
}, |
|
{ |
|
"epoch": 43.524904214559385, |
|
"grad_norm": 1.44577157497406, |
|
"learning_rate": 2.280699233716475e-05, |
|
"loss": 11.8103, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 43.60153256704981, |
|
"grad_norm": 1.2959234714508057, |
|
"learning_rate": 2.275909961685824e-05, |
|
"loss": 12.1443, |
|
"step": 56900 |
|
}, |
|
{ |
|
"epoch": 43.67816091954023, |
|
"grad_norm": 1.849253535270691, |
|
"learning_rate": 2.2711206896551727e-05, |
|
"loss": 12.037, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 43.75478927203065, |
|
"grad_norm": 1.46470046043396, |
|
"learning_rate": 2.266379310344828e-05, |
|
"loss": 12.0392, |
|
"step": 57100 |
|
}, |
|
{ |
|
"epoch": 43.83141762452107, |
|
"grad_norm": 1.7397308349609375, |
|
"learning_rate": 2.2615900383141765e-05, |
|
"loss": 11.8446, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 43.9080459770115, |
|
"grad_norm": 1.1144057512283325, |
|
"learning_rate": 2.256800766283525e-05, |
|
"loss": 12.0084, |
|
"step": 57300 |
|
}, |
|
{ |
|
"epoch": 43.984674329501914, |
|
"grad_norm": 4.426650047302246, |
|
"learning_rate": 2.2520114942528736e-05, |
|
"loss": 12.0514, |
|
"step": 57400 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_loss": 12.808330535888672, |
|
"eval_runtime": 44.0792, |
|
"eval_samples_per_second": 29.606, |
|
"eval_steps_per_second": 3.721, |
|
"step": 57420 |
|
}, |
|
{ |
|
"epoch": 44.06130268199234, |
|
"grad_norm": 1.1355741024017334, |
|
"learning_rate": 2.2472222222222223e-05, |
|
"loss": 11.9243, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 44.13793103448276, |
|
"grad_norm": 1.5547679662704468, |
|
"learning_rate": 2.2424329501915713e-05, |
|
"loss": 12.0711, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 44.21455938697318, |
|
"grad_norm": 1.5729808807373047, |
|
"learning_rate": 2.2376436781609196e-05, |
|
"loss": 11.9867, |
|
"step": 57700 |
|
}, |
|
{ |
|
"epoch": 44.2911877394636, |
|
"grad_norm": 1.2912790775299072, |
|
"learning_rate": 2.2328544061302683e-05, |
|
"loss": 11.8632, |
|
"step": 57800 |
|
}, |
|
{ |
|
"epoch": 44.367816091954026, |
|
"grad_norm": 1.2545444965362549, |
|
"learning_rate": 2.228065134099617e-05, |
|
"loss": 12.0665, |
|
"step": 57900 |
|
}, |
|
{ |
|
"epoch": 44.44444444444444, |
|
"grad_norm": 1.3165549039840698, |
|
"learning_rate": 2.2232758620689657e-05, |
|
"loss": 11.842, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 44.52107279693487, |
|
"grad_norm": 1.7680951356887817, |
|
"learning_rate": 2.218486590038314e-05, |
|
"loss": 11.8055, |
|
"step": 58100 |
|
}, |
|
{ |
|
"epoch": 44.59770114942529, |
|
"grad_norm": 2.2426023483276367, |
|
"learning_rate": 2.2136973180076627e-05, |
|
"loss": 12.1153, |
|
"step": 58200 |
|
}, |
|
{ |
|
"epoch": 44.67432950191571, |
|
"grad_norm": 0.9581509828567505, |
|
"learning_rate": 2.2089080459770118e-05, |
|
"loss": 11.8089, |
|
"step": 58300 |
|
}, |
|
{ |
|
"epoch": 44.75095785440613, |
|
"grad_norm": 2.1268539428710938, |
|
"learning_rate": 2.2041187739463605e-05, |
|
"loss": 11.8902, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 44.827586206896555, |
|
"grad_norm": 1.2000526189804077, |
|
"learning_rate": 2.1993295019157088e-05, |
|
"loss": 11.8651, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 44.90421455938697, |
|
"grad_norm": 2.349942684173584, |
|
"learning_rate": 2.1945402298850575e-05, |
|
"loss": 11.9236, |
|
"step": 58600 |
|
}, |
|
{ |
|
"epoch": 44.980842911877396, |
|
"grad_norm": 1.639948844909668, |
|
"learning_rate": 2.1897509578544062e-05, |
|
"loss": 11.9533, |
|
"step": 58700 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_loss": 12.792840003967285, |
|
"eval_runtime": 44.0555, |
|
"eval_samples_per_second": 29.622, |
|
"eval_steps_per_second": 3.723, |
|
"step": 58725 |
|
}, |
|
{ |
|
"epoch": 45.05747126436781, |
|
"grad_norm": 0.9822871088981628, |
|
"learning_rate": 2.184961685823755e-05, |
|
"loss": 11.9065, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 45.13409961685824, |
|
"grad_norm": 5.536319255828857, |
|
"learning_rate": 2.1801724137931036e-05, |
|
"loss": 11.9411, |
|
"step": 58900 |
|
}, |
|
{ |
|
"epoch": 45.21072796934866, |
|
"grad_norm": 1.8267079591751099, |
|
"learning_rate": 2.1753831417624522e-05, |
|
"loss": 11.8592, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 45.28735632183908, |
|
"grad_norm": 1.453710675239563, |
|
"learning_rate": 2.170593869731801e-05, |
|
"loss": 12.246, |
|
"step": 59100 |
|
}, |
|
{ |
|
"epoch": 45.3639846743295, |
|
"grad_norm": 1.5747921466827393, |
|
"learning_rate": 2.1658045977011496e-05, |
|
"loss": 12.1555, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 45.440613026819925, |
|
"grad_norm": 0.9929379224777222, |
|
"learning_rate": 2.1610153256704983e-05, |
|
"loss": 11.7682, |
|
"step": 59300 |
|
}, |
|
{ |
|
"epoch": 45.51724137931034, |
|
"grad_norm": 1.4931187629699707, |
|
"learning_rate": 2.1562260536398467e-05, |
|
"loss": 11.8555, |
|
"step": 59400 |
|
}, |
|
{ |
|
"epoch": 45.593869731800766, |
|
"grad_norm": 1.114998459815979, |
|
"learning_rate": 2.1514367816091953e-05, |
|
"loss": 11.8726, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 45.67049808429119, |
|
"grad_norm": 1.7308725118637085, |
|
"learning_rate": 2.146647509578544e-05, |
|
"loss": 12.0875, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 45.747126436781606, |
|
"grad_norm": 1.1630358695983887, |
|
"learning_rate": 2.141858237547893e-05, |
|
"loss": 11.8994, |
|
"step": 59700 |
|
}, |
|
{ |
|
"epoch": 45.82375478927203, |
|
"grad_norm": 1.9863486289978027, |
|
"learning_rate": 2.1370689655172414e-05, |
|
"loss": 11.9502, |
|
"step": 59800 |
|
}, |
|
{ |
|
"epoch": 45.900383141762454, |
|
"grad_norm": 1.3612456321716309, |
|
"learning_rate": 2.13227969348659e-05, |
|
"loss": 11.8048, |
|
"step": 59900 |
|
}, |
|
{ |
|
"epoch": 45.97701149425287, |
|
"grad_norm": 1.1734110116958618, |
|
"learning_rate": 2.1274904214559388e-05, |
|
"loss": 12.1155, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_loss": 12.802705764770508, |
|
"eval_runtime": 44.0902, |
|
"eval_samples_per_second": 29.598, |
|
"eval_steps_per_second": 3.72, |
|
"step": 60030 |
|
}, |
|
{ |
|
"epoch": 46.053639846743295, |
|
"grad_norm": 2.19791841506958, |
|
"learning_rate": 2.1227011494252875e-05, |
|
"loss": 12.0121, |
|
"step": 60100 |
|
}, |
|
{ |
|
"epoch": 46.13026819923372, |
|
"grad_norm": 3.206514358520508, |
|
"learning_rate": 2.1179597701149426e-05, |
|
"loss": 11.9131, |
|
"step": 60200 |
|
}, |
|
{ |
|
"epoch": 46.206896551724135, |
|
"grad_norm": 1.2101006507873535, |
|
"learning_rate": 2.1131704980842913e-05, |
|
"loss": 11.869, |
|
"step": 60300 |
|
}, |
|
{ |
|
"epoch": 46.28352490421456, |
|
"grad_norm": 1.3384582996368408, |
|
"learning_rate": 2.10838122605364e-05, |
|
"loss": 11.7608, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 46.36015325670498, |
|
"grad_norm": 3.215064764022827, |
|
"learning_rate": 2.1035919540229887e-05, |
|
"loss": 12.0495, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 46.4367816091954, |
|
"grad_norm": 1.26254403591156, |
|
"learning_rate": 2.0988026819923374e-05, |
|
"loss": 11.8855, |
|
"step": 60600 |
|
}, |
|
{ |
|
"epoch": 46.513409961685824, |
|
"grad_norm": 1.139722466468811, |
|
"learning_rate": 2.094013409961686e-05, |
|
"loss": 12.0157, |
|
"step": 60700 |
|
}, |
|
{ |
|
"epoch": 46.59003831417625, |
|
"grad_norm": 1.9146323204040527, |
|
"learning_rate": 2.0892241379310344e-05, |
|
"loss": 11.8276, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 46.666666666666664, |
|
"grad_norm": 1.6539549827575684, |
|
"learning_rate": 2.084434865900383e-05, |
|
"loss": 11.9677, |
|
"step": 60900 |
|
}, |
|
{ |
|
"epoch": 46.74329501915709, |
|
"grad_norm": 1.2380534410476685, |
|
"learning_rate": 2.0796455938697318e-05, |
|
"loss": 12.0291, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 46.81992337164751, |
|
"grad_norm": 1.8375437259674072, |
|
"learning_rate": 2.074856321839081e-05, |
|
"loss": 11.9032, |
|
"step": 61100 |
|
}, |
|
{ |
|
"epoch": 46.89655172413793, |
|
"grad_norm": 2.2188262939453125, |
|
"learning_rate": 2.0700670498084292e-05, |
|
"loss": 12.0465, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 46.97318007662835, |
|
"grad_norm": 1.1582258939743042, |
|
"learning_rate": 2.065277777777778e-05, |
|
"loss": 11.924, |
|
"step": 61300 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_loss": 12.797731399536133, |
|
"eval_runtime": 44.1559, |
|
"eval_samples_per_second": 29.554, |
|
"eval_steps_per_second": 3.714, |
|
"step": 61335 |
|
}, |
|
{ |
|
"epoch": 47.04980842911878, |
|
"grad_norm": 3.067289352416992, |
|
"learning_rate": 2.0604885057471266e-05, |
|
"loss": 11.8265, |
|
"step": 61400 |
|
}, |
|
{ |
|
"epoch": 47.12643678160919, |
|
"grad_norm": 1.3472516536712646, |
|
"learning_rate": 2.0556992337164752e-05, |
|
"loss": 11.8763, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 47.20306513409962, |
|
"grad_norm": 1.4235740900039673, |
|
"learning_rate": 2.050909961685824e-05, |
|
"loss": 11.9473, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 47.27969348659004, |
|
"grad_norm": 1.3170359134674072, |
|
"learning_rate": 2.0461206896551723e-05, |
|
"loss": 11.9381, |
|
"step": 61700 |
|
}, |
|
{ |
|
"epoch": 47.35632183908046, |
|
"grad_norm": 1.6014246940612793, |
|
"learning_rate": 2.0413314176245213e-05, |
|
"loss": 11.9074, |
|
"step": 61800 |
|
}, |
|
{ |
|
"epoch": 47.43295019157088, |
|
"grad_norm": 1.3270535469055176, |
|
"learning_rate": 2.03654214559387e-05, |
|
"loss": 11.9903, |
|
"step": 61900 |
|
}, |
|
{ |
|
"epoch": 47.509578544061306, |
|
"grad_norm": 1.1905503273010254, |
|
"learning_rate": 2.0317528735632187e-05, |
|
"loss": 11.9629, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 47.58620689655172, |
|
"grad_norm": 1.546738862991333, |
|
"learning_rate": 2.026963601532567e-05, |
|
"loss": 11.831, |
|
"step": 62100 |
|
}, |
|
{ |
|
"epoch": 47.662835249042146, |
|
"grad_norm": 1.5887172222137451, |
|
"learning_rate": 2.0221743295019157e-05, |
|
"loss": 12.0534, |
|
"step": 62200 |
|
}, |
|
{ |
|
"epoch": 47.73946360153257, |
|
"grad_norm": 1.3189942836761475, |
|
"learning_rate": 2.0173850574712644e-05, |
|
"loss": 11.9131, |
|
"step": 62300 |
|
}, |
|
{ |
|
"epoch": 47.81609195402299, |
|
"grad_norm": 1.9591014385223389, |
|
"learning_rate": 2.012595785440613e-05, |
|
"loss": 11.8583, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 47.89272030651341, |
|
"grad_norm": 1.6344765424728394, |
|
"learning_rate": 2.0078065134099618e-05, |
|
"loss": 11.9921, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 47.969348659003835, |
|
"grad_norm": 1.1810266971588135, |
|
"learning_rate": 2.0030172413793105e-05, |
|
"loss": 11.9987, |
|
"step": 62600 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_loss": 12.767735481262207, |
|
"eval_runtime": 44.144, |
|
"eval_samples_per_second": 29.562, |
|
"eval_steps_per_second": 3.715, |
|
"step": 62640 |
|
}, |
|
{ |
|
"epoch": 48.04597701149425, |
|
"grad_norm": 1.4370075464248657, |
|
"learning_rate": 1.998227969348659e-05, |
|
"loss": 12.0014, |
|
"step": 62700 |
|
}, |
|
{ |
|
"epoch": 48.122605363984675, |
|
"grad_norm": 1.2901791334152222, |
|
"learning_rate": 1.993438697318008e-05, |
|
"loss": 12.0385, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 48.1992337164751, |
|
"grad_norm": 1.2324562072753906, |
|
"learning_rate": 1.9886494252873565e-05, |
|
"loss": 11.9594, |
|
"step": 62900 |
|
}, |
|
{ |
|
"epoch": 48.275862068965516, |
|
"grad_norm": 1.40041983127594, |
|
"learning_rate": 1.983860153256705e-05, |
|
"loss": 11.76, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 48.35249042145594, |
|
"grad_norm": 1.5981560945510864, |
|
"learning_rate": 1.9790708812260536e-05, |
|
"loss": 11.8416, |
|
"step": 63100 |
|
}, |
|
{ |
|
"epoch": 48.42911877394636, |
|
"grad_norm": 1.5366255044937134, |
|
"learning_rate": 1.974329501915709e-05, |
|
"loss": 11.9168, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 48.50574712643678, |
|
"grad_norm": 2.1091346740722656, |
|
"learning_rate": 1.9695402298850578e-05, |
|
"loss": 11.7809, |
|
"step": 63300 |
|
}, |
|
{ |
|
"epoch": 48.582375478927204, |
|
"grad_norm": 3.076678991317749, |
|
"learning_rate": 1.964750957854406e-05, |
|
"loss": 11.8881, |
|
"step": 63400 |
|
}, |
|
{ |
|
"epoch": 48.65900383141762, |
|
"grad_norm": 1.6555073261260986, |
|
"learning_rate": 1.9599616858237548e-05, |
|
"loss": 11.6799, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 48.735632183908045, |
|
"grad_norm": 1.2696727514266968, |
|
"learning_rate": 1.9551724137931035e-05, |
|
"loss": 12.0306, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 48.81226053639847, |
|
"grad_norm": 1.739827275276184, |
|
"learning_rate": 1.9503831417624522e-05, |
|
"loss": 12.1005, |
|
"step": 63700 |
|
}, |
|
{ |
|
"epoch": 48.888888888888886, |
|
"grad_norm": 1.187231421470642, |
|
"learning_rate": 1.945593869731801e-05, |
|
"loss": 11.9703, |
|
"step": 63800 |
|
}, |
|
{ |
|
"epoch": 48.96551724137931, |
|
"grad_norm": 2.756282091140747, |
|
"learning_rate": 1.9408045977011496e-05, |
|
"loss": 12.0693, |
|
"step": 63900 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_loss": 12.775006294250488, |
|
"eval_runtime": 44.1249, |
|
"eval_samples_per_second": 29.575, |
|
"eval_steps_per_second": 3.717, |
|
"step": 63945 |
|
}, |
|
{ |
|
"epoch": 49.04214559386973, |
|
"grad_norm": 0.967854917049408, |
|
"learning_rate": 1.9360153256704983e-05, |
|
"loss": 11.9437, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 49.11877394636015, |
|
"grad_norm": 1.2055004835128784, |
|
"learning_rate": 1.931226053639847e-05, |
|
"loss": 11.9037, |
|
"step": 64100 |
|
}, |
|
{ |
|
"epoch": 49.195402298850574, |
|
"grad_norm": 1.6203746795654297, |
|
"learning_rate": 1.9264367816091956e-05, |
|
"loss": 11.9823, |
|
"step": 64200 |
|
}, |
|
{ |
|
"epoch": 49.272030651341, |
|
"grad_norm": 1.1399292945861816, |
|
"learning_rate": 1.921647509578544e-05, |
|
"loss": 12.0721, |
|
"step": 64300 |
|
}, |
|
{ |
|
"epoch": 49.348659003831415, |
|
"grad_norm": 1.3431105613708496, |
|
"learning_rate": 1.9168582375478927e-05, |
|
"loss": 11.8897, |
|
"step": 64400 |
|
}, |
|
{ |
|
"epoch": 49.42528735632184, |
|
"grad_norm": 1.316723346710205, |
|
"learning_rate": 1.9120689655172414e-05, |
|
"loss": 11.9025, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 49.50191570881226, |
|
"grad_norm": 1.8449369668960571, |
|
"learning_rate": 1.9072796934865904e-05, |
|
"loss": 11.6683, |
|
"step": 64600 |
|
}, |
|
{ |
|
"epoch": 49.57854406130268, |
|
"grad_norm": 1.3772321939468384, |
|
"learning_rate": 1.9024904214559387e-05, |
|
"loss": 12.2022, |
|
"step": 64700 |
|
}, |
|
{ |
|
"epoch": 49.6551724137931, |
|
"grad_norm": 2.2538058757781982, |
|
"learning_rate": 1.8977011494252874e-05, |
|
"loss": 11.8425, |
|
"step": 64800 |
|
}, |
|
{ |
|
"epoch": 49.73180076628353, |
|
"grad_norm": 2.1310970783233643, |
|
"learning_rate": 1.892911877394636e-05, |
|
"loss": 11.9638, |
|
"step": 64900 |
|
}, |
|
{ |
|
"epoch": 49.808429118773944, |
|
"grad_norm": 1.2570499181747437, |
|
"learning_rate": 1.8881226053639848e-05, |
|
"loss": 12.0367, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 49.88505747126437, |
|
"grad_norm": 1.6000453233718872, |
|
"learning_rate": 1.8833333333333335e-05, |
|
"loss": 12.0249, |
|
"step": 65100 |
|
}, |
|
{ |
|
"epoch": 49.96168582375479, |
|
"grad_norm": 1.2556895017623901, |
|
"learning_rate": 1.878544061302682e-05, |
|
"loss": 11.9285, |
|
"step": 65200 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_loss": 12.788679122924805, |
|
"eval_runtime": 44.0734, |
|
"eval_samples_per_second": 29.61, |
|
"eval_steps_per_second": 3.721, |
|
"step": 65250 |
|
}, |
|
{ |
|
"epoch": 50.03831417624521, |
|
"grad_norm": 1.4611543416976929, |
|
"learning_rate": 1.873754789272031e-05, |
|
"loss": 12.0139, |
|
"step": 65300 |
|
}, |
|
{ |
|
"epoch": 50.11494252873563, |
|
"grad_norm": 1.3939285278320312, |
|
"learning_rate": 1.869013409961686e-05, |
|
"loss": 12.1466, |
|
"step": 65400 |
|
}, |
|
{ |
|
"epoch": 50.191570881226056, |
|
"grad_norm": 1.378446102142334, |
|
"learning_rate": 1.8642241379310347e-05, |
|
"loss": 12.0221, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 50.26819923371647, |
|
"grad_norm": 1.1458476781845093, |
|
"learning_rate": 1.859434865900383e-05, |
|
"loss": 11.98, |
|
"step": 65600 |
|
}, |
|
{ |
|
"epoch": 50.3448275862069, |
|
"grad_norm": 1.2113792896270752, |
|
"learning_rate": 1.8546455938697318e-05, |
|
"loss": 11.7938, |
|
"step": 65700 |
|
}, |
|
{ |
|
"epoch": 50.42145593869732, |
|
"grad_norm": 3.7647705078125, |
|
"learning_rate": 1.8498563218390804e-05, |
|
"loss": 12.046, |
|
"step": 65800 |
|
}, |
|
{ |
|
"epoch": 50.49808429118774, |
|
"grad_norm": 1.4086334705352783, |
|
"learning_rate": 1.845067049808429e-05, |
|
"loss": 12.0137, |
|
"step": 65900 |
|
}, |
|
{ |
|
"epoch": 50.57471264367816, |
|
"grad_norm": 2.212301254272461, |
|
"learning_rate": 1.8402777777777778e-05, |
|
"loss": 11.8535, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 50.651340996168585, |
|
"grad_norm": 1.1334259510040283, |
|
"learning_rate": 1.8354885057471265e-05, |
|
"loss": 11.7534, |
|
"step": 66100 |
|
}, |
|
{ |
|
"epoch": 50.727969348659, |
|
"grad_norm": 1.3607604503631592, |
|
"learning_rate": 1.8306992337164752e-05, |
|
"loss": 12.1351, |
|
"step": 66200 |
|
}, |
|
{ |
|
"epoch": 50.804597701149426, |
|
"grad_norm": 0.9516454935073853, |
|
"learning_rate": 1.825909961685824e-05, |
|
"loss": 11.8739, |
|
"step": 66300 |
|
}, |
|
{ |
|
"epoch": 50.88122605363985, |
|
"grad_norm": 1.7874857187271118, |
|
"learning_rate": 1.8211206896551726e-05, |
|
"loss": 12.0046, |
|
"step": 66400 |
|
}, |
|
{ |
|
"epoch": 50.95785440613027, |
|
"grad_norm": 1.1303731203079224, |
|
"learning_rate": 1.816331417624521e-05, |
|
"loss": 11.8135, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_loss": 12.762798309326172, |
|
"eval_runtime": 44.1477, |
|
"eval_samples_per_second": 29.56, |
|
"eval_steps_per_second": 3.715, |
|
"step": 66555 |
|
}, |
|
{ |
|
"epoch": 51.03448275862069, |
|
"grad_norm": 2.8881723880767822, |
|
"learning_rate": 1.8115421455938696e-05, |
|
"loss": 11.8533, |
|
"step": 66600 |
|
}, |
|
{ |
|
"epoch": 51.111111111111114, |
|
"grad_norm": 1.2278690338134766, |
|
"learning_rate": 1.8067528735632186e-05, |
|
"loss": 11.9214, |
|
"step": 66700 |
|
}, |
|
{ |
|
"epoch": 51.18773946360153, |
|
"grad_norm": 1.9933656454086304, |
|
"learning_rate": 1.8019636015325673e-05, |
|
"loss": 11.8527, |
|
"step": 66800 |
|
}, |
|
{ |
|
"epoch": 51.264367816091955, |
|
"grad_norm": 1.4205143451690674, |
|
"learning_rate": 1.7971743295019157e-05, |
|
"loss": 12.0251, |
|
"step": 66900 |
|
}, |
|
{ |
|
"epoch": 51.34099616858238, |
|
"grad_norm": 1.319817304611206, |
|
"learning_rate": 1.7923850574712644e-05, |
|
"loss": 12.0983, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 51.417624521072796, |
|
"grad_norm": 1.6209360361099243, |
|
"learning_rate": 1.787595785440613e-05, |
|
"loss": 11.8053, |
|
"step": 67100 |
|
}, |
|
{ |
|
"epoch": 51.49425287356322, |
|
"grad_norm": 1.0465126037597656, |
|
"learning_rate": 1.7828065134099617e-05, |
|
"loss": 12.0158, |
|
"step": 67200 |
|
}, |
|
{ |
|
"epoch": 51.57088122605364, |
|
"grad_norm": 1.4087551832199097, |
|
"learning_rate": 1.7780172413793104e-05, |
|
"loss": 11.9305, |
|
"step": 67300 |
|
}, |
|
{ |
|
"epoch": 51.64750957854406, |
|
"grad_norm": 1.121779203414917, |
|
"learning_rate": 1.773227969348659e-05, |
|
"loss": 12.1881, |
|
"step": 67400 |
|
}, |
|
{ |
|
"epoch": 51.724137931034484, |
|
"grad_norm": 1.5989633798599243, |
|
"learning_rate": 1.7684386973180078e-05, |
|
"loss": 11.9698, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 51.8007662835249, |
|
"grad_norm": 1.1244069337844849, |
|
"learning_rate": 1.7636494252873565e-05, |
|
"loss": 11.7475, |
|
"step": 67600 |
|
}, |
|
{ |
|
"epoch": 51.877394636015325, |
|
"grad_norm": 1.2594223022460938, |
|
"learning_rate": 1.7589080459770117e-05, |
|
"loss": 11.9611, |
|
"step": 67700 |
|
}, |
|
{ |
|
"epoch": 51.95402298850575, |
|
"grad_norm": 1.6870946884155273, |
|
"learning_rate": 1.7541187739463604e-05, |
|
"loss": 11.7075, |
|
"step": 67800 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_loss": 12.814347267150879, |
|
"eval_runtime": 44.0743, |
|
"eval_samples_per_second": 29.609, |
|
"eval_steps_per_second": 3.721, |
|
"step": 67860 |
|
}, |
|
{ |
|
"epoch": 52.030651340996165, |
|
"grad_norm": 1.1319911479949951, |
|
"learning_rate": 1.7493295019157087e-05, |
|
"loss": 11.8327, |
|
"step": 67900 |
|
}, |
|
{ |
|
"epoch": 52.10727969348659, |
|
"grad_norm": 1.0522786378860474, |
|
"learning_rate": 1.7445402298850574e-05, |
|
"loss": 11.858, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 52.18390804597701, |
|
"grad_norm": 1.7333852052688599, |
|
"learning_rate": 1.7397509578544064e-05, |
|
"loss": 12.037, |
|
"step": 68100 |
|
}, |
|
{ |
|
"epoch": 52.26053639846743, |
|
"grad_norm": 1.7924898862838745, |
|
"learning_rate": 1.734961685823755e-05, |
|
"loss": 12.0778, |
|
"step": 68200 |
|
}, |
|
{ |
|
"epoch": 52.337164750957854, |
|
"grad_norm": 1.221550464630127, |
|
"learning_rate": 1.7301724137931035e-05, |
|
"loss": 12.1365, |
|
"step": 68300 |
|
}, |
|
{ |
|
"epoch": 52.41379310344828, |
|
"grad_norm": 1.6241466999053955, |
|
"learning_rate": 1.725383141762452e-05, |
|
"loss": 12.0267, |
|
"step": 68400 |
|
}, |
|
{ |
|
"epoch": 52.490421455938694, |
|
"grad_norm": 1.7579493522644043, |
|
"learning_rate": 1.720593869731801e-05, |
|
"loss": 11.7834, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 52.56704980842912, |
|
"grad_norm": 1.4909967184066772, |
|
"learning_rate": 1.7158045977011495e-05, |
|
"loss": 11.9632, |
|
"step": 68600 |
|
}, |
|
{ |
|
"epoch": 52.64367816091954, |
|
"grad_norm": 2.0708203315734863, |
|
"learning_rate": 1.7110153256704982e-05, |
|
"loss": 11.9318, |
|
"step": 68700 |
|
}, |
|
{ |
|
"epoch": 52.72030651340996, |
|
"grad_norm": 1.1900310516357422, |
|
"learning_rate": 1.706226053639847e-05, |
|
"loss": 11.8145, |
|
"step": 68800 |
|
}, |
|
{ |
|
"epoch": 52.79693486590038, |
|
"grad_norm": 1.2245934009552002, |
|
"learning_rate": 1.7014367816091956e-05, |
|
"loss": 11.6663, |
|
"step": 68900 |
|
}, |
|
{ |
|
"epoch": 52.87356321839081, |
|
"grad_norm": 1.6178796291351318, |
|
"learning_rate": 1.6966475095785443e-05, |
|
"loss": 11.9844, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 52.95019157088122, |
|
"grad_norm": 1.2077674865722656, |
|
"learning_rate": 1.691858237547893e-05, |
|
"loss": 11.6393, |
|
"step": 69100 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_loss": 12.77491283416748, |
|
"eval_runtime": 44.1152, |
|
"eval_samples_per_second": 29.582, |
|
"eval_steps_per_second": 3.718, |
|
"step": 69165 |
|
}, |
|
{ |
|
"epoch": 53.02681992337165, |
|
"grad_norm": 1.2087703943252563, |
|
"learning_rate": 1.6870689655172413e-05, |
|
"loss": 11.8316, |
|
"step": 69200 |
|
}, |
|
{ |
|
"epoch": 53.10344827586207, |
|
"grad_norm": 1.472959280014038, |
|
"learning_rate": 1.68227969348659e-05, |
|
"loss": 11.9068, |
|
"step": 69300 |
|
}, |
|
{ |
|
"epoch": 53.18007662835249, |
|
"grad_norm": 1.2973859310150146, |
|
"learning_rate": 1.6774904214559387e-05, |
|
"loss": 11.8753, |
|
"step": 69400 |
|
}, |
|
{ |
|
"epoch": 53.25670498084291, |
|
"grad_norm": 1.3909817934036255, |
|
"learning_rate": 1.6727011494252877e-05, |
|
"loss": 11.6868, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"grad_norm": 1.1226869821548462, |
|
"learning_rate": 1.667911877394636e-05, |
|
"loss": 11.7399, |
|
"step": 69600 |
|
}, |
|
{ |
|
"epoch": 53.40996168582375, |
|
"grad_norm": 1.6086245775222778, |
|
"learning_rate": 1.6631226053639847e-05, |
|
"loss": 11.9871, |
|
"step": 69700 |
|
}, |
|
{ |
|
"epoch": 53.486590038314176, |
|
"grad_norm": 5.143097400665283, |
|
"learning_rate": 1.65838122605364e-05, |
|
"loss": 12.0991, |
|
"step": 69800 |
|
}, |
|
{ |
|
"epoch": 53.5632183908046, |
|
"grad_norm": 1.1883777379989624, |
|
"learning_rate": 1.6535919540229886e-05, |
|
"loss": 11.7275, |
|
"step": 69900 |
|
}, |
|
{ |
|
"epoch": 53.63984674329502, |
|
"grad_norm": 1.152468204498291, |
|
"learning_rate": 1.6488026819923373e-05, |
|
"loss": 11.9268, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 53.71647509578544, |
|
"grad_norm": 1.6981552839279175, |
|
"learning_rate": 1.6440134099616856e-05, |
|
"loss": 12.0293, |
|
"step": 70100 |
|
}, |
|
{ |
|
"epoch": 53.793103448275865, |
|
"grad_norm": 1.6067506074905396, |
|
"learning_rate": 1.6392241379310347e-05, |
|
"loss": 11.9477, |
|
"step": 70200 |
|
}, |
|
{ |
|
"epoch": 53.86973180076628, |
|
"grad_norm": 3.569709539413452, |
|
"learning_rate": 1.6344348659003834e-05, |
|
"loss": 11.8055, |
|
"step": 70300 |
|
}, |
|
{ |
|
"epoch": 53.946360153256705, |
|
"grad_norm": 2.3322157859802246, |
|
"learning_rate": 1.629645593869732e-05, |
|
"loss": 12.027, |
|
"step": 70400 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_loss": 12.753838539123535, |
|
"eval_runtime": 44.1081, |
|
"eval_samples_per_second": 29.586, |
|
"eval_steps_per_second": 3.718, |
|
"step": 70470 |
|
}, |
|
{ |
|
"epoch": 54.02298850574713, |
|
"grad_norm": 1.4370397329330444, |
|
"learning_rate": 1.6248563218390804e-05, |
|
"loss": 12.0639, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 54.099616858237546, |
|
"grad_norm": 2.486645221710205, |
|
"learning_rate": 1.620067049808429e-05, |
|
"loss": 11.9231, |
|
"step": 70600 |
|
}, |
|
{ |
|
"epoch": 54.17624521072797, |
|
"grad_norm": 2.0936434268951416, |
|
"learning_rate": 1.6152777777777778e-05, |
|
"loss": 11.9161, |
|
"step": 70700 |
|
}, |
|
{ |
|
"epoch": 54.252873563218394, |
|
"grad_norm": 1.5211490392684937, |
|
"learning_rate": 1.6104885057471265e-05, |
|
"loss": 11.9338, |
|
"step": 70800 |
|
}, |
|
{ |
|
"epoch": 54.32950191570881, |
|
"grad_norm": 1.035090684890747, |
|
"learning_rate": 1.605699233716475e-05, |
|
"loss": 11.7872, |
|
"step": 70900 |
|
}, |
|
{ |
|
"epoch": 54.406130268199234, |
|
"grad_norm": 1.617077112197876, |
|
"learning_rate": 1.600909961685824e-05, |
|
"loss": 11.9772, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 54.48275862068966, |
|
"grad_norm": 1.3988826274871826, |
|
"learning_rate": 1.5961206896551725e-05, |
|
"loss": 12.0088, |
|
"step": 71100 |
|
}, |
|
{ |
|
"epoch": 54.559386973180075, |
|
"grad_norm": 1.7126933336257935, |
|
"learning_rate": 1.5913314176245212e-05, |
|
"loss": 11.9831, |
|
"step": 71200 |
|
}, |
|
{ |
|
"epoch": 54.6360153256705, |
|
"grad_norm": 2.3251850605010986, |
|
"learning_rate": 1.58654214559387e-05, |
|
"loss": 11.7345, |
|
"step": 71300 |
|
}, |
|
{ |
|
"epoch": 54.71264367816092, |
|
"grad_norm": 1.6456447839736938, |
|
"learning_rate": 1.5817528735632183e-05, |
|
"loss": 12.0158, |
|
"step": 71400 |
|
}, |
|
{ |
|
"epoch": 54.78927203065134, |
|
"grad_norm": 2.1808829307556152, |
|
"learning_rate": 1.576963601532567e-05, |
|
"loss": 12.0169, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 54.86590038314176, |
|
"grad_norm": 2.2233774662017822, |
|
"learning_rate": 1.572174329501916e-05, |
|
"loss": 11.9144, |
|
"step": 71600 |
|
}, |
|
{ |
|
"epoch": 54.94252873563218, |
|
"grad_norm": 1.5419303178787231, |
|
"learning_rate": 1.5673850574712647e-05, |
|
"loss": 11.7915, |
|
"step": 71700 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_loss": 12.735248565673828, |
|
"eval_runtime": 44.0526, |
|
"eval_samples_per_second": 29.624, |
|
"eval_steps_per_second": 3.723, |
|
"step": 71775 |
|
}, |
|
{ |
|
"epoch": 55.019157088122604, |
|
"grad_norm": 2.4967896938323975, |
|
"learning_rate": 1.562595785440613e-05, |
|
"loss": 12.1777, |
|
"step": 71800 |
|
}, |
|
{ |
|
"epoch": 55.09578544061303, |
|
"grad_norm": 1.6103179454803467, |
|
"learning_rate": 1.5578065134099617e-05, |
|
"loss": 12.0236, |
|
"step": 71900 |
|
}, |
|
{ |
|
"epoch": 55.172413793103445, |
|
"grad_norm": 1.058643102645874, |
|
"learning_rate": 1.553065134099617e-05, |
|
"loss": 11.9485, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 55.24904214559387, |
|
"grad_norm": 1.1860133409500122, |
|
"learning_rate": 1.5482758620689656e-05, |
|
"loss": 11.7885, |
|
"step": 72100 |
|
}, |
|
{ |
|
"epoch": 55.32567049808429, |
|
"grad_norm": 2.6516213417053223, |
|
"learning_rate": 1.5434865900383142e-05, |
|
"loss": 11.8373, |
|
"step": 72200 |
|
}, |
|
{ |
|
"epoch": 55.40229885057471, |
|
"grad_norm": 1.3108186721801758, |
|
"learning_rate": 1.538697318007663e-05, |
|
"loss": 11.8938, |
|
"step": 72300 |
|
}, |
|
{ |
|
"epoch": 55.47892720306513, |
|
"grad_norm": 2.721954345703125, |
|
"learning_rate": 1.5339080459770116e-05, |
|
"loss": 11.873, |
|
"step": 72400 |
|
}, |
|
{ |
|
"epoch": 55.55555555555556, |
|
"grad_norm": 1.0352996587753296, |
|
"learning_rate": 1.5291187739463603e-05, |
|
"loss": 12.025, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 55.632183908045974, |
|
"grad_norm": 1.258169412612915, |
|
"learning_rate": 1.5243295019157088e-05, |
|
"loss": 11.9444, |
|
"step": 72600 |
|
}, |
|
{ |
|
"epoch": 55.7088122605364, |
|
"grad_norm": 2.314866781234741, |
|
"learning_rate": 1.5195402298850575e-05, |
|
"loss": 11.711, |
|
"step": 72700 |
|
}, |
|
{ |
|
"epoch": 55.78544061302682, |
|
"grad_norm": 1.308590292930603, |
|
"learning_rate": 1.5147509578544062e-05, |
|
"loss": 12.0446, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 55.86206896551724, |
|
"grad_norm": 2.928891897201538, |
|
"learning_rate": 1.5099616858237547e-05, |
|
"loss": 11.9413, |
|
"step": 72900 |
|
}, |
|
{ |
|
"epoch": 55.93869731800766, |
|
"grad_norm": 1.048743724822998, |
|
"learning_rate": 1.5051724137931036e-05, |
|
"loss": 11.791, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_loss": 12.750344276428223, |
|
"eval_runtime": 44.088, |
|
"eval_samples_per_second": 29.6, |
|
"eval_steps_per_second": 3.72, |
|
"step": 73080 |
|
}, |
|
{ |
|
"epoch": 56.015325670498086, |
|
"grad_norm": 3.6337478160858154, |
|
"learning_rate": 1.5003831417624523e-05, |
|
"loss": 11.9951, |
|
"step": 73100 |
|
}, |
|
{ |
|
"epoch": 56.0919540229885, |
|
"grad_norm": 1.7665445804595947, |
|
"learning_rate": 1.495593869731801e-05, |
|
"loss": 12.1332, |
|
"step": 73200 |
|
}, |
|
{ |
|
"epoch": 56.16858237547893, |
|
"grad_norm": 1.4894465208053589, |
|
"learning_rate": 1.4908045977011495e-05, |
|
"loss": 11.7198, |
|
"step": 73300 |
|
}, |
|
{ |
|
"epoch": 56.24521072796935, |
|
"grad_norm": 1.0169578790664673, |
|
"learning_rate": 1.4860153256704982e-05, |
|
"loss": 12.0523, |
|
"step": 73400 |
|
}, |
|
{ |
|
"epoch": 56.32183908045977, |
|
"grad_norm": 1.2872236967086792, |
|
"learning_rate": 1.4812260536398467e-05, |
|
"loss": 11.8438, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 56.39846743295019, |
|
"grad_norm": 1.1032931804656982, |
|
"learning_rate": 1.4764367816091954e-05, |
|
"loss": 11.9058, |
|
"step": 73600 |
|
}, |
|
{ |
|
"epoch": 56.475095785440615, |
|
"grad_norm": 1.4371570348739624, |
|
"learning_rate": 1.4716475095785442e-05, |
|
"loss": 11.9199, |
|
"step": 73700 |
|
}, |
|
{ |
|
"epoch": 56.55172413793103, |
|
"grad_norm": 1.9667787551879883, |
|
"learning_rate": 1.4668582375478929e-05, |
|
"loss": 11.899, |
|
"step": 73800 |
|
}, |
|
{ |
|
"epoch": 56.628352490421456, |
|
"grad_norm": 1.2465131282806396, |
|
"learning_rate": 1.4620689655172414e-05, |
|
"loss": 11.9303, |
|
"step": 73900 |
|
}, |
|
{ |
|
"epoch": 56.70498084291188, |
|
"grad_norm": 1.2738486528396606, |
|
"learning_rate": 1.4573275862068966e-05, |
|
"loss": 11.9897, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 56.7816091954023, |
|
"grad_norm": 1.7295411825180054, |
|
"learning_rate": 1.4525383141762453e-05, |
|
"loss": 11.989, |
|
"step": 74100 |
|
}, |
|
{ |
|
"epoch": 56.85823754789272, |
|
"grad_norm": 3.2072668075561523, |
|
"learning_rate": 1.4477490421455938e-05, |
|
"loss": 11.8107, |
|
"step": 74200 |
|
}, |
|
{ |
|
"epoch": 56.934865900383144, |
|
"grad_norm": 1.3828212022781372, |
|
"learning_rate": 1.4429597701149425e-05, |
|
"loss": 11.7899, |
|
"step": 74300 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_loss": 12.746719360351562, |
|
"eval_runtime": 44.0689, |
|
"eval_samples_per_second": 29.613, |
|
"eval_steps_per_second": 3.721, |
|
"step": 74385 |
|
}, |
|
{ |
|
"epoch": 57.01149425287356, |
|
"grad_norm": 1.1235148906707764, |
|
"learning_rate": 1.4381704980842914e-05, |
|
"loss": 11.9095, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 57.088122605363985, |
|
"grad_norm": 1.3013513088226318, |
|
"learning_rate": 1.43338122605364e-05, |
|
"loss": 11.8367, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 57.16475095785441, |
|
"grad_norm": 1.46478271484375, |
|
"learning_rate": 1.4285919540229886e-05, |
|
"loss": 11.8926, |
|
"step": 74600 |
|
}, |
|
{ |
|
"epoch": 57.241379310344826, |
|
"grad_norm": 1.7883129119873047, |
|
"learning_rate": 1.4238026819923373e-05, |
|
"loss": 11.7109, |
|
"step": 74700 |
|
}, |
|
{ |
|
"epoch": 57.31800766283525, |
|
"grad_norm": 2.2156434059143066, |
|
"learning_rate": 1.419013409961686e-05, |
|
"loss": 11.9904, |
|
"step": 74800 |
|
}, |
|
{ |
|
"epoch": 57.39463601532567, |
|
"grad_norm": 1.963996410369873, |
|
"learning_rate": 1.4142241379310345e-05, |
|
"loss": 11.8243, |
|
"step": 74900 |
|
}, |
|
{ |
|
"epoch": 57.47126436781609, |
|
"grad_norm": 1.5265462398529053, |
|
"learning_rate": 1.4094348659003831e-05, |
|
"loss": 11.982, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 57.547892720306514, |
|
"grad_norm": 1.5820256471633911, |
|
"learning_rate": 1.404645593869732e-05, |
|
"loss": 12.0055, |
|
"step": 75100 |
|
}, |
|
{ |
|
"epoch": 57.62452107279694, |
|
"grad_norm": 1.2654030323028564, |
|
"learning_rate": 1.3998563218390807e-05, |
|
"loss": 11.8634, |
|
"step": 75200 |
|
}, |
|
{ |
|
"epoch": 57.701149425287355, |
|
"grad_norm": 2.1730732917785645, |
|
"learning_rate": 1.3950670498084292e-05, |
|
"loss": 12.098, |
|
"step": 75300 |
|
}, |
|
{ |
|
"epoch": 57.77777777777778, |
|
"grad_norm": 1.7732394933700562, |
|
"learning_rate": 1.3902777777777779e-05, |
|
"loss": 11.856, |
|
"step": 75400 |
|
}, |
|
{ |
|
"epoch": 57.8544061302682, |
|
"grad_norm": 1.366039514541626, |
|
"learning_rate": 1.3854885057471264e-05, |
|
"loss": 12.0139, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 57.93103448275862, |
|
"grad_norm": 2.9070754051208496, |
|
"learning_rate": 1.3806992337164751e-05, |
|
"loss": 11.9716, |
|
"step": 75600 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_loss": 12.731040000915527, |
|
"eval_runtime": 44.0877, |
|
"eval_samples_per_second": 29.6, |
|
"eval_steps_per_second": 3.72, |
|
"step": 75690 |
|
}, |
|
{ |
|
"epoch": 58.00766283524904, |
|
"grad_norm": 2.1817991733551025, |
|
"learning_rate": 1.3759099616858236e-05, |
|
"loss": 11.906, |
|
"step": 75700 |
|
}, |
|
{ |
|
"epoch": 58.08429118773947, |
|
"grad_norm": 1.2766177654266357, |
|
"learning_rate": 1.3711206896551726e-05, |
|
"loss": 12.0479, |
|
"step": 75800 |
|
}, |
|
{ |
|
"epoch": 58.160919540229884, |
|
"grad_norm": 2.82973575592041, |
|
"learning_rate": 1.3663314176245212e-05, |
|
"loss": 11.947, |
|
"step": 75900 |
|
}, |
|
{ |
|
"epoch": 58.23754789272031, |
|
"grad_norm": 1.2385036945343018, |
|
"learning_rate": 1.3615421455938699e-05, |
|
"loss": 11.9196, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 58.31417624521073, |
|
"grad_norm": 1.3823829889297485, |
|
"learning_rate": 1.3567528735632184e-05, |
|
"loss": 11.9057, |
|
"step": 76100 |
|
}, |
|
{ |
|
"epoch": 58.39080459770115, |
|
"grad_norm": 1.472506046295166, |
|
"learning_rate": 1.351963601532567e-05, |
|
"loss": 11.9563, |
|
"step": 76200 |
|
}, |
|
{ |
|
"epoch": 58.46743295019157, |
|
"grad_norm": 1.5811665058135986, |
|
"learning_rate": 1.3472222222222222e-05, |
|
"loss": 11.8257, |
|
"step": 76300 |
|
}, |
|
{ |
|
"epoch": 58.54406130268199, |
|
"grad_norm": 1.5588597059249878, |
|
"learning_rate": 1.3424329501915708e-05, |
|
"loss": 11.8564, |
|
"step": 76400 |
|
}, |
|
{ |
|
"epoch": 58.62068965517241, |
|
"grad_norm": 1.5810322761535645, |
|
"learning_rate": 1.3376436781609198e-05, |
|
"loss": 11.8566, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 58.69731800766284, |
|
"grad_norm": 1.5648218393325806, |
|
"learning_rate": 1.3328544061302683e-05, |
|
"loss": 11.9988, |
|
"step": 76600 |
|
}, |
|
{ |
|
"epoch": 58.77394636015325, |
|
"grad_norm": 1.8077315092086792, |
|
"learning_rate": 1.328065134099617e-05, |
|
"loss": 11.7739, |
|
"step": 76700 |
|
}, |
|
{ |
|
"epoch": 58.85057471264368, |
|
"grad_norm": 1.1517853736877441, |
|
"learning_rate": 1.3232758620689655e-05, |
|
"loss": 11.9046, |
|
"step": 76800 |
|
}, |
|
{ |
|
"epoch": 58.9272030651341, |
|
"grad_norm": 1.4639145135879517, |
|
"learning_rate": 1.3184865900383142e-05, |
|
"loss": 11.99, |
|
"step": 76900 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_loss": 12.737883567810059, |
|
"eval_runtime": 44.0757, |
|
"eval_samples_per_second": 29.608, |
|
"eval_steps_per_second": 3.721, |
|
"step": 76995 |
|
}, |
|
{ |
|
"epoch": 59.00383141762452, |
|
"grad_norm": 0.9936187267303467, |
|
"learning_rate": 1.3136973180076629e-05, |
|
"loss": 11.9348, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 59.08045977011494, |
|
"grad_norm": 1.227501630783081, |
|
"learning_rate": 1.3089080459770114e-05, |
|
"loss": 11.9054, |
|
"step": 77100 |
|
}, |
|
{ |
|
"epoch": 59.157088122605366, |
|
"grad_norm": 1.1214205026626587, |
|
"learning_rate": 1.3041187739463603e-05, |
|
"loss": 11.7912, |
|
"step": 77200 |
|
}, |
|
{ |
|
"epoch": 59.23371647509578, |
|
"grad_norm": 1.3010284900665283, |
|
"learning_rate": 1.299329501915709e-05, |
|
"loss": 11.8542, |
|
"step": 77300 |
|
}, |
|
{ |
|
"epoch": 59.310344827586206, |
|
"grad_norm": 1.291937232017517, |
|
"learning_rate": 1.2945402298850576e-05, |
|
"loss": 11.8613, |
|
"step": 77400 |
|
}, |
|
{ |
|
"epoch": 59.38697318007663, |
|
"grad_norm": 1.224834680557251, |
|
"learning_rate": 1.2897509578544062e-05, |
|
"loss": 11.905, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 59.46360153256705, |
|
"grad_norm": 1.308899998664856, |
|
"learning_rate": 1.2849616858237548e-05, |
|
"loss": 11.9067, |
|
"step": 77600 |
|
}, |
|
{ |
|
"epoch": 59.54022988505747, |
|
"grad_norm": 1.4333239793777466, |
|
"learning_rate": 1.2801724137931034e-05, |
|
"loss": 11.8825, |
|
"step": 77700 |
|
}, |
|
{ |
|
"epoch": 59.616858237547895, |
|
"grad_norm": 1.0542117357254028, |
|
"learning_rate": 1.275383141762452e-05, |
|
"loss": 12.1948, |
|
"step": 77800 |
|
}, |
|
{ |
|
"epoch": 59.69348659003831, |
|
"grad_norm": 1.9502829313278198, |
|
"learning_rate": 1.2705938697318009e-05, |
|
"loss": 11.9644, |
|
"step": 77900 |
|
}, |
|
{ |
|
"epoch": 59.770114942528735, |
|
"grad_norm": 1.3281497955322266, |
|
"learning_rate": 1.2658045977011496e-05, |
|
"loss": 11.8953, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 59.84674329501916, |
|
"grad_norm": 1.2546237707138062, |
|
"learning_rate": 1.2610153256704981e-05, |
|
"loss": 11.8375, |
|
"step": 78100 |
|
}, |
|
{ |
|
"epoch": 59.923371647509576, |
|
"grad_norm": 1.1630369424819946, |
|
"learning_rate": 1.2562260536398468e-05, |
|
"loss": 11.7133, |
|
"step": 78200 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 1.7483701705932617, |
|
"learning_rate": 1.2514367816091955e-05, |
|
"loss": 12.2012, |
|
"step": 78300 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_loss": 12.731696128845215, |
|
"eval_runtime": 44.1463, |
|
"eval_samples_per_second": 29.561, |
|
"eval_steps_per_second": 3.715, |
|
"step": 78300 |
|
}, |
|
{ |
|
"epoch": 60.076628352490424, |
|
"grad_norm": 2.260547399520874, |
|
"learning_rate": 1.2466954022988505e-05, |
|
"loss": 11.9756, |
|
"step": 78400 |
|
}, |
|
{ |
|
"epoch": 60.15325670498084, |
|
"grad_norm": 1.387416124343872, |
|
"learning_rate": 1.2419061302681993e-05, |
|
"loss": 11.9715, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 60.229885057471265, |
|
"grad_norm": 4.537426948547363, |
|
"learning_rate": 1.2371168582375479e-05, |
|
"loss": 11.6355, |
|
"step": 78600 |
|
}, |
|
{ |
|
"epoch": 60.30651340996169, |
|
"grad_norm": 1.930817723274231, |
|
"learning_rate": 1.2323275862068966e-05, |
|
"loss": 11.6992, |
|
"step": 78700 |
|
}, |
|
{ |
|
"epoch": 60.383141762452105, |
|
"grad_norm": 1.7206836938858032, |
|
"learning_rate": 1.2275383141762452e-05, |
|
"loss": 11.8606, |
|
"step": 78800 |
|
}, |
|
{ |
|
"epoch": 60.45977011494253, |
|
"grad_norm": 1.7796626091003418, |
|
"learning_rate": 1.222749042145594e-05, |
|
"loss": 11.8648, |
|
"step": 78900 |
|
}, |
|
{ |
|
"epoch": 60.53639846743295, |
|
"grad_norm": 1.6132935285568237, |
|
"learning_rate": 1.2179597701149426e-05, |
|
"loss": 11.7958, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 60.61302681992337, |
|
"grad_norm": 1.2063769102096558, |
|
"learning_rate": 1.2131704980842913e-05, |
|
"loss": 11.8877, |
|
"step": 79100 |
|
}, |
|
{ |
|
"epoch": 60.689655172413794, |
|
"grad_norm": 1.6793837547302246, |
|
"learning_rate": 1.20838122605364e-05, |
|
"loss": 11.9401, |
|
"step": 79200 |
|
}, |
|
{ |
|
"epoch": 60.76628352490422, |
|
"grad_norm": 2.0831589698791504, |
|
"learning_rate": 1.2035919540229885e-05, |
|
"loss": 11.832, |
|
"step": 79300 |
|
}, |
|
{ |
|
"epoch": 60.842911877394634, |
|
"grad_norm": 1.4812095165252686, |
|
"learning_rate": 1.1988026819923372e-05, |
|
"loss": 12.0039, |
|
"step": 79400 |
|
}, |
|
{ |
|
"epoch": 60.91954022988506, |
|
"grad_norm": 2.111269474029541, |
|
"learning_rate": 1.1940134099616859e-05, |
|
"loss": 12.0629, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 60.99616858237548, |
|
"grad_norm": 1.0717095136642456, |
|
"learning_rate": 1.1892241379310346e-05, |
|
"loss": 11.7839, |
|
"step": 79600 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_loss": 12.74968433380127, |
|
"eval_runtime": 44.1528, |
|
"eval_samples_per_second": 29.556, |
|
"eval_steps_per_second": 3.714, |
|
"step": 79605 |
|
}, |
|
{ |
|
"epoch": 61.0727969348659, |
|
"grad_norm": 2.625854969024658, |
|
"learning_rate": 1.1844348659003831e-05, |
|
"loss": 11.9218, |
|
"step": 79700 |
|
}, |
|
{ |
|
"epoch": 61.14942528735632, |
|
"grad_norm": 1.9146480560302734, |
|
"learning_rate": 1.179645593869732e-05, |
|
"loss": 11.6761, |
|
"step": 79800 |
|
}, |
|
{ |
|
"epoch": 61.22605363984675, |
|
"grad_norm": 0.9696165919303894, |
|
"learning_rate": 1.1748563218390805e-05, |
|
"loss": 11.9288, |
|
"step": 79900 |
|
}, |
|
{ |
|
"epoch": 61.30268199233716, |
|
"grad_norm": 1.1847577095031738, |
|
"learning_rate": 1.1700670498084292e-05, |
|
"loss": 11.9674, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 61.37931034482759, |
|
"grad_norm": 1.3804477453231812, |
|
"learning_rate": 1.1652777777777778e-05, |
|
"loss": 11.812, |
|
"step": 80100 |
|
}, |
|
{ |
|
"epoch": 61.45593869731801, |
|
"grad_norm": 1.6096410751342773, |
|
"learning_rate": 1.1604885057471265e-05, |
|
"loss": 11.8585, |
|
"step": 80200 |
|
}, |
|
{ |
|
"epoch": 61.53256704980843, |
|
"grad_norm": 1.8098353147506714, |
|
"learning_rate": 1.1556992337164752e-05, |
|
"loss": 11.8667, |
|
"step": 80300 |
|
}, |
|
{ |
|
"epoch": 61.60919540229885, |
|
"grad_norm": 6.6866068840026855, |
|
"learning_rate": 1.1509099616858237e-05, |
|
"loss": 11.8999, |
|
"step": 80400 |
|
}, |
|
{ |
|
"epoch": 61.68582375478927, |
|
"grad_norm": 2.7860629558563232, |
|
"learning_rate": 1.1461206896551726e-05, |
|
"loss": 11.8976, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 61.76245210727969, |
|
"grad_norm": 1.7936979532241821, |
|
"learning_rate": 1.1413314176245211e-05, |
|
"loss": 11.913, |
|
"step": 80600 |
|
}, |
|
{ |
|
"epoch": 61.839080459770116, |
|
"grad_norm": 1.7207527160644531, |
|
"learning_rate": 1.1365421455938698e-05, |
|
"loss": 12.0002, |
|
"step": 80700 |
|
}, |
|
{ |
|
"epoch": 61.91570881226053, |
|
"grad_norm": 2.8500571250915527, |
|
"learning_rate": 1.1317528735632183e-05, |
|
"loss": 12.0012, |
|
"step": 80800 |
|
}, |
|
{ |
|
"epoch": 61.99233716475096, |
|
"grad_norm": 2.1529831886291504, |
|
"learning_rate": 1.1269636015325672e-05, |
|
"loss": 11.9888, |
|
"step": 80900 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_loss": 12.742037773132324, |
|
"eval_runtime": 44.1517, |
|
"eval_samples_per_second": 29.557, |
|
"eval_steps_per_second": 3.714, |
|
"step": 80910 |
|
}, |
|
{ |
|
"epoch": 62.06896551724138, |
|
"grad_norm": 1.1954108476638794, |
|
"learning_rate": 1.1221743295019157e-05, |
|
"loss": 11.9691, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 62.1455938697318, |
|
"grad_norm": 1.253891944885254, |
|
"learning_rate": 1.1174329501915709e-05, |
|
"loss": 12.0618, |
|
"step": 81100 |
|
}, |
|
{ |
|
"epoch": 62.22222222222222, |
|
"grad_norm": 1.5132429599761963, |
|
"learning_rate": 1.1126436781609197e-05, |
|
"loss": 11.9311, |
|
"step": 81200 |
|
}, |
|
{ |
|
"epoch": 62.298850574712645, |
|
"grad_norm": 1.215069055557251, |
|
"learning_rate": 1.1078544061302683e-05, |
|
"loss": 11.7015, |
|
"step": 81300 |
|
}, |
|
{ |
|
"epoch": 62.37547892720306, |
|
"grad_norm": 2.0881459712982178, |
|
"learning_rate": 1.103065134099617e-05, |
|
"loss": 12.0909, |
|
"step": 81400 |
|
}, |
|
{ |
|
"epoch": 62.452107279693486, |
|
"grad_norm": 1.079714298248291, |
|
"learning_rate": 1.0982758620689655e-05, |
|
"loss": 11.9608, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 62.52873563218391, |
|
"grad_norm": 1.3947062492370605, |
|
"learning_rate": 1.0934865900383143e-05, |
|
"loss": 11.8452, |
|
"step": 81600 |
|
}, |
|
{ |
|
"epoch": 62.60536398467433, |
|
"grad_norm": 1.0822895765304565, |
|
"learning_rate": 1.0886973180076628e-05, |
|
"loss": 11.8232, |
|
"step": 81700 |
|
}, |
|
{ |
|
"epoch": 62.68199233716475, |
|
"grad_norm": 1.6000736951828003, |
|
"learning_rate": 1.0839080459770115e-05, |
|
"loss": 11.994, |
|
"step": 81800 |
|
}, |
|
{ |
|
"epoch": 62.758620689655174, |
|
"grad_norm": 1.6020923852920532, |
|
"learning_rate": 1.0791187739463602e-05, |
|
"loss": 11.9019, |
|
"step": 81900 |
|
}, |
|
{ |
|
"epoch": 62.83524904214559, |
|
"grad_norm": 1.4164994955062866, |
|
"learning_rate": 1.0743295019157089e-05, |
|
"loss": 11.8139, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 62.911877394636015, |
|
"grad_norm": 2.334690570831299, |
|
"learning_rate": 1.0695402298850576e-05, |
|
"loss": 12.0714, |
|
"step": 82100 |
|
}, |
|
{ |
|
"epoch": 62.98850574712644, |
|
"grad_norm": 1.8338385820388794, |
|
"learning_rate": 1.0647509578544061e-05, |
|
"loss": 11.8382, |
|
"step": 82200 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"eval_loss": 12.733258247375488, |
|
"eval_runtime": 44.1527, |
|
"eval_samples_per_second": 29.557, |
|
"eval_steps_per_second": 3.714, |
|
"step": 82215 |
|
}, |
|
{ |
|
"epoch": 63.065134099616856, |
|
"grad_norm": 3.91227650642395, |
|
"learning_rate": 1.059961685823755e-05, |
|
"loss": 11.9929, |
|
"step": 82300 |
|
}, |
|
{ |
|
"epoch": 63.14176245210728, |
|
"grad_norm": 1.1621551513671875, |
|
"learning_rate": 1.0551724137931035e-05, |
|
"loss": 11.9456, |
|
"step": 82400 |
|
}, |
|
{ |
|
"epoch": 63.2183908045977, |
|
"grad_norm": 1.4154562950134277, |
|
"learning_rate": 1.0503831417624522e-05, |
|
"loss": 12.0645, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 63.29501915708812, |
|
"grad_norm": 1.8987462520599365, |
|
"learning_rate": 1.0455938697318009e-05, |
|
"loss": 11.873, |
|
"step": 82600 |
|
}, |
|
{ |
|
"epoch": 63.371647509578544, |
|
"grad_norm": 1.8300188779830933, |
|
"learning_rate": 1.0408045977011495e-05, |
|
"loss": 11.7687, |
|
"step": 82700 |
|
}, |
|
{ |
|
"epoch": 63.44827586206897, |
|
"grad_norm": 1.4220359325408936, |
|
"learning_rate": 1.036015325670498e-05, |
|
"loss": 11.8298, |
|
"step": 82800 |
|
}, |
|
{ |
|
"epoch": 63.524904214559385, |
|
"grad_norm": 1.1422735452651978, |
|
"learning_rate": 1.0312260536398468e-05, |
|
"loss": 11.9857, |
|
"step": 82900 |
|
}, |
|
{ |
|
"epoch": 63.60153256704981, |
|
"grad_norm": 1.6723980903625488, |
|
"learning_rate": 1.0264367816091954e-05, |
|
"loss": 11.6692, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 63.67816091954023, |
|
"grad_norm": 1.3438162803649902, |
|
"learning_rate": 1.0216954022988506e-05, |
|
"loss": 11.8703, |
|
"step": 83100 |
|
}, |
|
{ |
|
"epoch": 63.75478927203065, |
|
"grad_norm": 1.2540138959884644, |
|
"learning_rate": 1.0169061302681993e-05, |
|
"loss": 11.8198, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 63.83141762452107, |
|
"grad_norm": 1.439274787902832, |
|
"learning_rate": 1.012116858237548e-05, |
|
"loss": 11.8904, |
|
"step": 83300 |
|
}, |
|
{ |
|
"epoch": 63.9080459770115, |
|
"grad_norm": 1.0765241384506226, |
|
"learning_rate": 1.0073275862068967e-05, |
|
"loss": 11.8521, |
|
"step": 83400 |
|
}, |
|
{ |
|
"epoch": 63.984674329501914, |
|
"grad_norm": 1.066419005393982, |
|
"learning_rate": 1.0025383141762452e-05, |
|
"loss": 11.8361, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_loss": 12.740053176879883, |
|
"eval_runtime": 44.1473, |
|
"eval_samples_per_second": 29.56, |
|
"eval_steps_per_second": 3.715, |
|
"step": 83520 |
|
}, |
|
{ |
|
"epoch": 64.06130268199233, |
|
"grad_norm": 1.2648850679397583, |
|
"learning_rate": 9.977490421455939e-06, |
|
"loss": 12.1, |
|
"step": 83600 |
|
}, |
|
{ |
|
"epoch": 64.13793103448276, |
|
"grad_norm": 1.115157961845398, |
|
"learning_rate": 9.929597701149426e-06, |
|
"loss": 11.798, |
|
"step": 83700 |
|
}, |
|
{ |
|
"epoch": 64.21455938697318, |
|
"grad_norm": 1.6352553367614746, |
|
"learning_rate": 9.881704980842913e-06, |
|
"loss": 11.9761, |
|
"step": 83800 |
|
}, |
|
{ |
|
"epoch": 64.2911877394636, |
|
"grad_norm": 1.2003965377807617, |
|
"learning_rate": 9.833812260536398e-06, |
|
"loss": 11.9813, |
|
"step": 83900 |
|
}, |
|
{ |
|
"epoch": 64.36781609195403, |
|
"grad_norm": 1.5004589557647705, |
|
"learning_rate": 9.785919540229886e-06, |
|
"loss": 11.7826, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 64.44444444444444, |
|
"grad_norm": 1.3350985050201416, |
|
"learning_rate": 9.738026819923372e-06, |
|
"loss": 11.8015, |
|
"step": 84100 |
|
}, |
|
{ |
|
"epoch": 64.52107279693486, |
|
"grad_norm": 1.5985853672027588, |
|
"learning_rate": 9.690134099616858e-06, |
|
"loss": 11.6736, |
|
"step": 84200 |
|
}, |
|
{ |
|
"epoch": 64.59770114942529, |
|
"grad_norm": 2.1115546226501465, |
|
"learning_rate": 9.642241379310345e-06, |
|
"loss": 11.7572, |
|
"step": 84300 |
|
}, |
|
{ |
|
"epoch": 64.67432950191571, |
|
"grad_norm": 2.5769665241241455, |
|
"learning_rate": 9.594348659003832e-06, |
|
"loss": 11.8057, |
|
"step": 84400 |
|
}, |
|
{ |
|
"epoch": 64.75095785440612, |
|
"grad_norm": 3.2280073165893555, |
|
"learning_rate": 9.546455938697319e-06, |
|
"loss": 11.9184, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 64.82758620689656, |
|
"grad_norm": 1.2311729192733765, |
|
"learning_rate": 9.498563218390804e-06, |
|
"loss": 11.9657, |
|
"step": 84600 |
|
}, |
|
{ |
|
"epoch": 64.90421455938697, |
|
"grad_norm": 1.6303430795669556, |
|
"learning_rate": 9.450670498084293e-06, |
|
"loss": 11.9864, |
|
"step": 84700 |
|
}, |
|
{ |
|
"epoch": 64.98084291187739, |
|
"grad_norm": 1.6421687602996826, |
|
"learning_rate": 9.402777777777778e-06, |
|
"loss": 11.8224, |
|
"step": 84800 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"eval_loss": 12.752345085144043, |
|
"eval_runtime": 44.1763, |
|
"eval_samples_per_second": 29.541, |
|
"eval_steps_per_second": 3.712, |
|
"step": 84825 |
|
}, |
|
{ |
|
"epoch": 65.05747126436782, |
|
"grad_norm": 1.2040326595306396, |
|
"learning_rate": 9.354885057471265e-06, |
|
"loss": 11.7626, |
|
"step": 84900 |
|
}, |
|
{ |
|
"epoch": 65.13409961685824, |
|
"grad_norm": 1.1865389347076416, |
|
"learning_rate": 9.30699233716475e-06, |
|
"loss": 12.015, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 65.21072796934865, |
|
"grad_norm": 2.0402724742889404, |
|
"learning_rate": 9.259099616858239e-06, |
|
"loss": 11.8473, |
|
"step": 85100 |
|
}, |
|
{ |
|
"epoch": 65.28735632183908, |
|
"grad_norm": 1.8505759239196777, |
|
"learning_rate": 9.21168582375479e-06, |
|
"loss": 11.9353, |
|
"step": 85200 |
|
}, |
|
{ |
|
"epoch": 65.3639846743295, |
|
"grad_norm": 2.3651750087738037, |
|
"learning_rate": 9.163793103448276e-06, |
|
"loss": 12.0637, |
|
"step": 85300 |
|
}, |
|
{ |
|
"epoch": 65.44061302681992, |
|
"grad_norm": 1.9731732606887817, |
|
"learning_rate": 9.115900383141762e-06, |
|
"loss": 12.0013, |
|
"step": 85400 |
|
}, |
|
{ |
|
"epoch": 65.51724137931035, |
|
"grad_norm": 1.3928194046020508, |
|
"learning_rate": 9.06800766283525e-06, |
|
"loss": 11.6937, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 65.59386973180077, |
|
"grad_norm": 1.580771565437317, |
|
"learning_rate": 9.020114942528736e-06, |
|
"loss": 11.5997, |
|
"step": 85600 |
|
}, |
|
{ |
|
"epoch": 65.67049808429118, |
|
"grad_norm": 1.143648624420166, |
|
"learning_rate": 8.972222222222221e-06, |
|
"loss": 11.948, |
|
"step": 85700 |
|
}, |
|
{ |
|
"epoch": 65.74712643678161, |
|
"grad_norm": 1.9105567932128906, |
|
"learning_rate": 8.92432950191571e-06, |
|
"loss": 11.9796, |
|
"step": 85800 |
|
}, |
|
{ |
|
"epoch": 65.82375478927203, |
|
"grad_norm": 1.3926714658737183, |
|
"learning_rate": 8.876436781609195e-06, |
|
"loss": 11.7775, |
|
"step": 85900 |
|
}, |
|
{ |
|
"epoch": 65.90038314176245, |
|
"grad_norm": 1.1419901847839355, |
|
"learning_rate": 8.828544061302682e-06, |
|
"loss": 11.7615, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 65.97701149425288, |
|
"grad_norm": 1.6939061880111694, |
|
"learning_rate": 8.780651340996169e-06, |
|
"loss": 11.8244, |
|
"step": 86100 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_loss": 12.737361907958984, |
|
"eval_runtime": 44.1505, |
|
"eval_samples_per_second": 29.558, |
|
"eval_steps_per_second": 3.715, |
|
"step": 86130 |
|
}, |
|
{ |
|
"epoch": 66.0536398467433, |
|
"grad_norm": 1.953165054321289, |
|
"learning_rate": 8.732758620689656e-06, |
|
"loss": 11.9442, |
|
"step": 86200 |
|
}, |
|
{ |
|
"epoch": 66.13026819923371, |
|
"grad_norm": 2.1596179008483887, |
|
"learning_rate": 8.684865900383143e-06, |
|
"loss": 11.764, |
|
"step": 86300 |
|
}, |
|
{ |
|
"epoch": 66.20689655172414, |
|
"grad_norm": 1.4609719514846802, |
|
"learning_rate": 8.636973180076628e-06, |
|
"loss": 12.1997, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 66.28352490421456, |
|
"grad_norm": 2.0631511211395264, |
|
"learning_rate": 8.589080459770116e-06, |
|
"loss": 11.8684, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 66.36015325670498, |
|
"grad_norm": 1.4530664682388306, |
|
"learning_rate": 8.541187739463602e-06, |
|
"loss": 11.8307, |
|
"step": 86600 |
|
}, |
|
{ |
|
"epoch": 66.4367816091954, |
|
"grad_norm": 2.148606777191162, |
|
"learning_rate": 8.493295019157089e-06, |
|
"loss": 11.9725, |
|
"step": 86700 |
|
}, |
|
{ |
|
"epoch": 66.51340996168582, |
|
"grad_norm": 1.8974863290786743, |
|
"learning_rate": 8.445402298850575e-06, |
|
"loss": 11.9907, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 66.59003831417624, |
|
"grad_norm": 2.369657278060913, |
|
"learning_rate": 8.397509578544062e-06, |
|
"loss": 11.9563, |
|
"step": 86900 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 1.6854480504989624, |
|
"learning_rate": 8.349616858237547e-06, |
|
"loss": 11.9173, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 66.74329501915709, |
|
"grad_norm": 1.6539610624313354, |
|
"learning_rate": 8.301724137931034e-06, |
|
"loss": 11.9584, |
|
"step": 87100 |
|
}, |
|
{ |
|
"epoch": 66.8199233716475, |
|
"grad_norm": 1.346731424331665, |
|
"learning_rate": 8.253831417624521e-06, |
|
"loss": 11.7909, |
|
"step": 87200 |
|
}, |
|
{ |
|
"epoch": 66.89655172413794, |
|
"grad_norm": 1.6548290252685547, |
|
"learning_rate": 8.206417624521073e-06, |
|
"loss": 11.9346, |
|
"step": 87300 |
|
}, |
|
{ |
|
"epoch": 66.97318007662835, |
|
"grad_norm": 1.1189563274383545, |
|
"learning_rate": 8.15852490421456e-06, |
|
"loss": 11.9832, |
|
"step": 87400 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"eval_loss": 12.747148513793945, |
|
"eval_runtime": 44.147, |
|
"eval_samples_per_second": 29.56, |
|
"eval_steps_per_second": 3.715, |
|
"step": 87435 |
|
}, |
|
{ |
|
"epoch": 67.04980842911877, |
|
"grad_norm": 1.7302024364471436, |
|
"learning_rate": 8.110632183908045e-06, |
|
"loss": 11.8374, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 67.1264367816092, |
|
"grad_norm": 0.8793215751647949, |
|
"learning_rate": 8.062739463601534e-06, |
|
"loss": 11.7415, |
|
"step": 87600 |
|
}, |
|
{ |
|
"epoch": 67.20306513409962, |
|
"grad_norm": 1.1903204917907715, |
|
"learning_rate": 8.014846743295019e-06, |
|
"loss": 11.8223, |
|
"step": 87700 |
|
}, |
|
{ |
|
"epoch": 67.27969348659003, |
|
"grad_norm": 2.025223731994629, |
|
"learning_rate": 7.966954022988506e-06, |
|
"loss": 11.7065, |
|
"step": 87800 |
|
}, |
|
{ |
|
"epoch": 67.35632183908046, |
|
"grad_norm": 1.2028359174728394, |
|
"learning_rate": 7.919061302681993e-06, |
|
"loss": 11.9446, |
|
"step": 87900 |
|
}, |
|
{ |
|
"epoch": 67.43295019157088, |
|
"grad_norm": 1.56088387966156, |
|
"learning_rate": 7.87116858237548e-06, |
|
"loss": 12.0176, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 67.5095785440613, |
|
"grad_norm": 1.4466462135314941, |
|
"learning_rate": 7.823275862068966e-06, |
|
"loss": 11.8777, |
|
"step": 88100 |
|
}, |
|
{ |
|
"epoch": 67.58620689655173, |
|
"grad_norm": 2.2348804473876953, |
|
"learning_rate": 7.775383141762453e-06, |
|
"loss": 11.8506, |
|
"step": 88200 |
|
}, |
|
{ |
|
"epoch": 67.66283524904215, |
|
"grad_norm": 1.0889838933944702, |
|
"learning_rate": 7.72749042145594e-06, |
|
"loss": 11.9706, |
|
"step": 88300 |
|
}, |
|
{ |
|
"epoch": 67.73946360153256, |
|
"grad_norm": 1.6289935111999512, |
|
"learning_rate": 7.679597701149425e-06, |
|
"loss": 11.9588, |
|
"step": 88400 |
|
}, |
|
{ |
|
"epoch": 67.816091954023, |
|
"grad_norm": 1.2480045557022095, |
|
"learning_rate": 7.631704980842912e-06, |
|
"loss": 11.7933, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 67.89272030651341, |
|
"grad_norm": 1.5679010152816772, |
|
"learning_rate": 7.583812260536399e-06, |
|
"loss": 12.0495, |
|
"step": 88600 |
|
}, |
|
{ |
|
"epoch": 67.96934865900383, |
|
"grad_norm": 1.2820953130722046, |
|
"learning_rate": 7.535919540229885e-06, |
|
"loss": 11.8478, |
|
"step": 88700 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_loss": 12.716951370239258, |
|
"eval_runtime": 44.1526, |
|
"eval_samples_per_second": 29.557, |
|
"eval_steps_per_second": 3.714, |
|
"step": 88740 |
|
}, |
|
{ |
|
"epoch": 68.04597701149426, |
|
"grad_norm": 1.0503605604171753, |
|
"learning_rate": 7.488026819923372e-06, |
|
"loss": 11.9092, |
|
"step": 88800 |
|
}, |
|
{ |
|
"epoch": 68.12260536398468, |
|
"grad_norm": 1.5500402450561523, |
|
"learning_rate": 7.440134099616859e-06, |
|
"loss": 11.933, |
|
"step": 88900 |
|
}, |
|
{ |
|
"epoch": 68.19923371647509, |
|
"grad_norm": 2.4164953231811523, |
|
"learning_rate": 7.392241379310346e-06, |
|
"loss": 11.8528, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 68.27586206896552, |
|
"grad_norm": 1.7877123355865479, |
|
"learning_rate": 7.344348659003832e-06, |
|
"loss": 11.8459, |
|
"step": 89100 |
|
}, |
|
{ |
|
"epoch": 68.35249042145594, |
|
"grad_norm": 1.6601005792617798, |
|
"learning_rate": 7.296455938697318e-06, |
|
"loss": 11.8986, |
|
"step": 89200 |
|
}, |
|
{ |
|
"epoch": 68.42911877394636, |
|
"grad_norm": 1.6431148052215576, |
|
"learning_rate": 7.24904214559387e-06, |
|
"loss": 11.8467, |
|
"step": 89300 |
|
}, |
|
{ |
|
"epoch": 68.50574712643679, |
|
"grad_norm": 1.2147421836853027, |
|
"learning_rate": 7.201149425287357e-06, |
|
"loss": 11.9989, |
|
"step": 89400 |
|
}, |
|
{ |
|
"epoch": 68.5823754789272, |
|
"grad_norm": 1.0646436214447021, |
|
"learning_rate": 7.153256704980843e-06, |
|
"loss": 11.6439, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 68.65900383141762, |
|
"grad_norm": 1.494936466217041, |
|
"learning_rate": 7.105363984674329e-06, |
|
"loss": 11.8232, |
|
"step": 89600 |
|
}, |
|
{ |
|
"epoch": 68.73563218390805, |
|
"grad_norm": 1.1928653717041016, |
|
"learning_rate": 7.057471264367817e-06, |
|
"loss": 12.032, |
|
"step": 89700 |
|
}, |
|
{ |
|
"epoch": 68.81226053639847, |
|
"grad_norm": 1.2193999290466309, |
|
"learning_rate": 7.009578544061303e-06, |
|
"loss": 11.8999, |
|
"step": 89800 |
|
}, |
|
{ |
|
"epoch": 68.88888888888889, |
|
"grad_norm": 1.418272852897644, |
|
"learning_rate": 6.961685823754789e-06, |
|
"loss": 12.0139, |
|
"step": 89900 |
|
}, |
|
{ |
|
"epoch": 68.96551724137932, |
|
"grad_norm": 2.331040620803833, |
|
"learning_rate": 6.913793103448277e-06, |
|
"loss": 12.0201, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"eval_loss": 12.731438636779785, |
|
"eval_runtime": 44.1419, |
|
"eval_samples_per_second": 29.564, |
|
"eval_steps_per_second": 3.715, |
|
"step": 90045 |
|
}, |
|
{ |
|
"epoch": 69.04214559386973, |
|
"grad_norm": 1.2469091415405273, |
|
"learning_rate": 6.865900383141763e-06, |
|
"loss": 11.7182, |
|
"step": 90100 |
|
}, |
|
{ |
|
"epoch": 69.11877394636015, |
|
"grad_norm": 1.299902319908142, |
|
"learning_rate": 6.818007662835249e-06, |
|
"loss": 11.908, |
|
"step": 90200 |
|
}, |
|
{ |
|
"epoch": 69.19540229885058, |
|
"grad_norm": 2.0446414947509766, |
|
"learning_rate": 6.770114942528737e-06, |
|
"loss": 11.8736, |
|
"step": 90300 |
|
}, |
|
{ |
|
"epoch": 69.272030651341, |
|
"grad_norm": 2.1058554649353027, |
|
"learning_rate": 6.722222222222223e-06, |
|
"loss": 11.7726, |
|
"step": 90400 |
|
}, |
|
{ |
|
"epoch": 69.34865900383141, |
|
"grad_norm": 1.222571849822998, |
|
"learning_rate": 6.674329501915709e-06, |
|
"loss": 12.1008, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 69.42528735632185, |
|
"grad_norm": 1.2086107730865479, |
|
"learning_rate": 6.6264367816091955e-06, |
|
"loss": 11.9332, |
|
"step": 90600 |
|
}, |
|
{ |
|
"epoch": 69.50191570881226, |
|
"grad_norm": 1.188658356666565, |
|
"learning_rate": 6.578544061302682e-06, |
|
"loss": 11.9603, |
|
"step": 90700 |
|
}, |
|
{ |
|
"epoch": 69.57854406130268, |
|
"grad_norm": 1.1233985424041748, |
|
"learning_rate": 6.530651340996169e-06, |
|
"loss": 11.7879, |
|
"step": 90800 |
|
}, |
|
{ |
|
"epoch": 69.65517241379311, |
|
"grad_norm": 1.8599299192428589, |
|
"learning_rate": 6.482758620689655e-06, |
|
"loss": 12.0864, |
|
"step": 90900 |
|
}, |
|
{ |
|
"epoch": 69.73180076628353, |
|
"grad_norm": 1.213908076286316, |
|
"learning_rate": 6.434865900383143e-06, |
|
"loss": 11.7091, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 69.80842911877394, |
|
"grad_norm": 1.2682372331619263, |
|
"learning_rate": 6.386973180076629e-06, |
|
"loss": 11.8762, |
|
"step": 91100 |
|
}, |
|
{ |
|
"epoch": 69.88505747126437, |
|
"grad_norm": 1.940184473991394, |
|
"learning_rate": 6.339080459770115e-06, |
|
"loss": 11.6487, |
|
"step": 91200 |
|
}, |
|
{ |
|
"epoch": 69.96168582375479, |
|
"grad_norm": 1.4338123798370361, |
|
"learning_rate": 6.291187739463601e-06, |
|
"loss": 12.152, |
|
"step": 91300 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_loss": 12.735883712768555, |
|
"eval_runtime": 44.179, |
|
"eval_samples_per_second": 29.539, |
|
"eval_steps_per_second": 3.712, |
|
"step": 91350 |
|
}, |
|
{ |
|
"epoch": 70.03831417624521, |
|
"grad_norm": 2.018376111984253, |
|
"learning_rate": 6.243295019157088e-06, |
|
"loss": 11.9978, |
|
"step": 91400 |
|
}, |
|
{ |
|
"epoch": 70.11494252873563, |
|
"grad_norm": 1.4965932369232178, |
|
"learning_rate": 6.195881226053641e-06, |
|
"loss": 11.9588, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 70.19157088122606, |
|
"grad_norm": 1.5459176301956177, |
|
"learning_rate": 6.147988505747127e-06, |
|
"loss": 11.7513, |
|
"step": 91600 |
|
}, |
|
{ |
|
"epoch": 70.26819923371647, |
|
"grad_norm": 1.6559784412384033, |
|
"learning_rate": 6.1000957854406135e-06, |
|
"loss": 11.8124, |
|
"step": 91700 |
|
}, |
|
{ |
|
"epoch": 70.34482758620689, |
|
"grad_norm": 2.100288152694702, |
|
"learning_rate": 6.0522030651341e-06, |
|
"loss": 11.8001, |
|
"step": 91800 |
|
}, |
|
{ |
|
"epoch": 70.42145593869732, |
|
"grad_norm": 2.0167760848999023, |
|
"learning_rate": 6.0043103448275864e-06, |
|
"loss": 11.7079, |
|
"step": 91900 |
|
}, |
|
{ |
|
"epoch": 70.49808429118774, |
|
"grad_norm": 1.2484099864959717, |
|
"learning_rate": 5.956417624521073e-06, |
|
"loss": 11.8747, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 70.57471264367815, |
|
"grad_norm": 1.4585705995559692, |
|
"learning_rate": 5.908524904214559e-06, |
|
"loss": 11.6371, |
|
"step": 92100 |
|
}, |
|
{ |
|
"epoch": 70.65134099616859, |
|
"grad_norm": 1.2680083513259888, |
|
"learning_rate": 5.860632183908046e-06, |
|
"loss": 11.8783, |
|
"step": 92200 |
|
}, |
|
{ |
|
"epoch": 70.727969348659, |
|
"grad_norm": 3.2429590225219727, |
|
"learning_rate": 5.812739463601532e-06, |
|
"loss": 12.0867, |
|
"step": 92300 |
|
}, |
|
{ |
|
"epoch": 70.80459770114942, |
|
"grad_norm": 1.6496800184249878, |
|
"learning_rate": 5.764846743295019e-06, |
|
"loss": 11.8665, |
|
"step": 92400 |
|
}, |
|
{ |
|
"epoch": 70.88122605363985, |
|
"grad_norm": 1.7092400789260864, |
|
"learning_rate": 5.716954022988506e-06, |
|
"loss": 11.8957, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 70.95785440613027, |
|
"grad_norm": 1.308349370956421, |
|
"learning_rate": 5.669061302681993e-06, |
|
"loss": 11.6562, |
|
"step": 92600 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"eval_loss": 12.738100051879883, |
|
"eval_runtime": 44.1855, |
|
"eval_samples_per_second": 29.535, |
|
"eval_steps_per_second": 3.712, |
|
"step": 92655 |
|
}, |
|
{ |
|
"epoch": 71.03448275862068, |
|
"grad_norm": 1.4456454515457153, |
|
"learning_rate": 5.62116858237548e-06, |
|
"loss": 11.9577, |
|
"step": 92700 |
|
}, |
|
{ |
|
"epoch": 71.11111111111111, |
|
"grad_norm": 1.178861141204834, |
|
"learning_rate": 5.573275862068966e-06, |
|
"loss": 11.7769, |
|
"step": 92800 |
|
}, |
|
{ |
|
"epoch": 71.18773946360153, |
|
"grad_norm": 1.2721989154815674, |
|
"learning_rate": 5.525383141762453e-06, |
|
"loss": 12.0604, |
|
"step": 92900 |
|
}, |
|
{ |
|
"epoch": 71.26436781609195, |
|
"grad_norm": 1.4360485076904297, |
|
"learning_rate": 5.4774904214559396e-06, |
|
"loss": 11.853, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 71.34099616858238, |
|
"grad_norm": 1.1324783563613892, |
|
"learning_rate": 5.429597701149426e-06, |
|
"loss": 12.0389, |
|
"step": 93100 |
|
}, |
|
{ |
|
"epoch": 71.4176245210728, |
|
"grad_norm": 1.327430009841919, |
|
"learning_rate": 5.3817049808429125e-06, |
|
"loss": 12.1736, |
|
"step": 93200 |
|
}, |
|
{ |
|
"epoch": 71.49425287356321, |
|
"grad_norm": 1.7536532878875732, |
|
"learning_rate": 5.3338122605363985e-06, |
|
"loss": 11.8394, |
|
"step": 93300 |
|
}, |
|
{ |
|
"epoch": 71.57088122605364, |
|
"grad_norm": 1.2314512729644775, |
|
"learning_rate": 5.285919540229885e-06, |
|
"loss": 11.8958, |
|
"step": 93400 |
|
}, |
|
{ |
|
"epoch": 71.64750957854406, |
|
"grad_norm": 1.3814700841903687, |
|
"learning_rate": 5.2380268199233714e-06, |
|
"loss": 11.8036, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 71.72413793103448, |
|
"grad_norm": 1.6986061334609985, |
|
"learning_rate": 5.190134099616858e-06, |
|
"loss": 11.7598, |
|
"step": 93600 |
|
}, |
|
{ |
|
"epoch": 71.80076628352491, |
|
"grad_norm": 1.1988410949707031, |
|
"learning_rate": 5.142241379310345e-06, |
|
"loss": 11.7643, |
|
"step": 93700 |
|
}, |
|
{ |
|
"epoch": 71.87739463601532, |
|
"grad_norm": 1.005979061126709, |
|
"learning_rate": 5.094827586206897e-06, |
|
"loss": 11.8694, |
|
"step": 93800 |
|
}, |
|
{ |
|
"epoch": 71.95402298850574, |
|
"grad_norm": 1.8171489238739014, |
|
"learning_rate": 5.046934865900384e-06, |
|
"loss": 11.7541, |
|
"step": 93900 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_loss": 12.730957984924316, |
|
"eval_runtime": 44.1811, |
|
"eval_samples_per_second": 29.538, |
|
"eval_steps_per_second": 3.712, |
|
"step": 93960 |
|
}, |
|
{ |
|
"epoch": 72.03065134099617, |
|
"grad_norm": 1.2113227844238281, |
|
"learning_rate": 4.99904214559387e-06, |
|
"loss": 11.8434, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 72.10727969348659, |
|
"grad_norm": 1.9516360759735107, |
|
"learning_rate": 4.951149425287357e-06, |
|
"loss": 12.0732, |
|
"step": 94100 |
|
}, |
|
{ |
|
"epoch": 72.183908045977, |
|
"grad_norm": 1.6725817918777466, |
|
"learning_rate": 4.903256704980843e-06, |
|
"loss": 11.9187, |
|
"step": 94200 |
|
}, |
|
{ |
|
"epoch": 72.26053639846744, |
|
"grad_norm": 1.5325151681900024, |
|
"learning_rate": 4.85536398467433e-06, |
|
"loss": 11.8286, |
|
"step": 94300 |
|
}, |
|
{ |
|
"epoch": 72.33716475095785, |
|
"grad_norm": 1.4346359968185425, |
|
"learning_rate": 4.807471264367816e-06, |
|
"loss": 11.9449, |
|
"step": 94400 |
|
}, |
|
{ |
|
"epoch": 72.41379310344827, |
|
"grad_norm": 1.8294119834899902, |
|
"learning_rate": 4.7595785440613025e-06, |
|
"loss": 11.7885, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 72.4904214559387, |
|
"grad_norm": 3.0054831504821777, |
|
"learning_rate": 4.7116858237547894e-06, |
|
"loss": 11.9011, |
|
"step": 94600 |
|
}, |
|
{ |
|
"epoch": 72.56704980842912, |
|
"grad_norm": 3.023944616317749, |
|
"learning_rate": 4.663793103448276e-06, |
|
"loss": 11.7951, |
|
"step": 94700 |
|
}, |
|
{ |
|
"epoch": 72.64367816091954, |
|
"grad_norm": 1.6727356910705566, |
|
"learning_rate": 4.615900383141763e-06, |
|
"loss": 11.6363, |
|
"step": 94800 |
|
}, |
|
{ |
|
"epoch": 72.72030651340997, |
|
"grad_norm": 2.4141032695770264, |
|
"learning_rate": 4.568007662835249e-06, |
|
"loss": 11.8062, |
|
"step": 94900 |
|
}, |
|
{ |
|
"epoch": 72.79693486590038, |
|
"grad_norm": 1.810632348060608, |
|
"learning_rate": 4.520114942528736e-06, |
|
"loss": 11.7885, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 72.8735632183908, |
|
"grad_norm": 1.2663646936416626, |
|
"learning_rate": 4.472222222222222e-06, |
|
"loss": 11.8532, |
|
"step": 95100 |
|
}, |
|
{ |
|
"epoch": 72.95019157088123, |
|
"grad_norm": 1.1440293788909912, |
|
"learning_rate": 4.424329501915709e-06, |
|
"loss": 11.9398, |
|
"step": 95200 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"eval_loss": 12.724422454833984, |
|
"eval_runtime": 44.1981, |
|
"eval_samples_per_second": 29.526, |
|
"eval_steps_per_second": 3.711, |
|
"step": 95265 |
|
}, |
|
{ |
|
"epoch": 73.02681992337165, |
|
"grad_norm": 1.0655268430709839, |
|
"learning_rate": 4.376436781609196e-06, |
|
"loss": 11.9855, |
|
"step": 95300 |
|
}, |
|
{ |
|
"epoch": 73.10344827586206, |
|
"grad_norm": 1.2701817750930786, |
|
"learning_rate": 4.328544061302682e-06, |
|
"loss": 11.7504, |
|
"step": 95400 |
|
}, |
|
{ |
|
"epoch": 73.1800766283525, |
|
"grad_norm": 1.4740400314331055, |
|
"learning_rate": 4.280651340996169e-06, |
|
"loss": 11.8391, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 73.25670498084291, |
|
"grad_norm": 2.1387853622436523, |
|
"learning_rate": 4.232758620689655e-06, |
|
"loss": 11.8052, |
|
"step": 95600 |
|
}, |
|
{ |
|
"epoch": 73.33333333333333, |
|
"grad_norm": 1.295242190361023, |
|
"learning_rate": 4.184865900383142e-06, |
|
"loss": 11.9859, |
|
"step": 95700 |
|
}, |
|
{ |
|
"epoch": 73.40996168582376, |
|
"grad_norm": 1.4711384773254395, |
|
"learning_rate": 4.136973180076629e-06, |
|
"loss": 12.1523, |
|
"step": 95800 |
|
}, |
|
{ |
|
"epoch": 73.48659003831418, |
|
"grad_norm": 1.7779674530029297, |
|
"learning_rate": 4.089080459770115e-06, |
|
"loss": 11.698, |
|
"step": 95900 |
|
}, |
|
{ |
|
"epoch": 73.5632183908046, |
|
"grad_norm": 2.6070003509521484, |
|
"learning_rate": 4.0411877394636015e-06, |
|
"loss": 11.9877, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 73.63984674329502, |
|
"grad_norm": 1.4775136709213257, |
|
"learning_rate": 3.993295019157088e-06, |
|
"loss": 11.7928, |
|
"step": 96100 |
|
}, |
|
{ |
|
"epoch": 73.71647509578544, |
|
"grad_norm": 1.7105778455734253, |
|
"learning_rate": 3.945402298850575e-06, |
|
"loss": 12.0444, |
|
"step": 96200 |
|
}, |
|
{ |
|
"epoch": 73.79310344827586, |
|
"grad_norm": 1.6719238758087158, |
|
"learning_rate": 3.897988505747126e-06, |
|
"loss": 11.9407, |
|
"step": 96300 |
|
}, |
|
{ |
|
"epoch": 73.86973180076629, |
|
"grad_norm": 1.312474250793457, |
|
"learning_rate": 3.850095785440613e-06, |
|
"loss": 11.7468, |
|
"step": 96400 |
|
}, |
|
{ |
|
"epoch": 73.9463601532567, |
|
"grad_norm": 0.9431168437004089, |
|
"learning_rate": 3.8022030651340995e-06, |
|
"loss": 11.8737, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_loss": 12.720576286315918, |
|
"eval_runtime": 44.1482, |
|
"eval_samples_per_second": 29.56, |
|
"eval_steps_per_second": 3.715, |
|
"step": 96570 |
|
}, |
|
{ |
|
"epoch": 74.02298850574712, |
|
"grad_norm": 1.6064398288726807, |
|
"learning_rate": 3.7543103448275864e-06, |
|
"loss": 11.8828, |
|
"step": 96600 |
|
}, |
|
{ |
|
"epoch": 74.09961685823755, |
|
"grad_norm": 2.088803768157959, |
|
"learning_rate": 3.7064176245210733e-06, |
|
"loss": 11.7576, |
|
"step": 96700 |
|
}, |
|
{ |
|
"epoch": 74.17624521072797, |
|
"grad_norm": 1.5417454242706299, |
|
"learning_rate": 3.6585249042145593e-06, |
|
"loss": 11.9239, |
|
"step": 96800 |
|
}, |
|
{ |
|
"epoch": 74.25287356321839, |
|
"grad_norm": 1.5983319282531738, |
|
"learning_rate": 3.610632183908046e-06, |
|
"loss": 11.8119, |
|
"step": 96900 |
|
}, |
|
{ |
|
"epoch": 74.32950191570882, |
|
"grad_norm": 3.7642099857330322, |
|
"learning_rate": 3.5627394636015326e-06, |
|
"loss": 11.8259, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 74.40613026819923, |
|
"grad_norm": 1.5149072408676147, |
|
"learning_rate": 3.5148467432950195e-06, |
|
"loss": 11.9898, |
|
"step": 97100 |
|
}, |
|
{ |
|
"epoch": 74.48275862068965, |
|
"grad_norm": 0.9915036559104919, |
|
"learning_rate": 3.4669540229885055e-06, |
|
"loss": 11.7665, |
|
"step": 97200 |
|
}, |
|
{ |
|
"epoch": 74.55938697318008, |
|
"grad_norm": 1.2745176553726196, |
|
"learning_rate": 3.4190613026819924e-06, |
|
"loss": 11.9657, |
|
"step": 97300 |
|
}, |
|
{ |
|
"epoch": 74.6360153256705, |
|
"grad_norm": 2.390751600265503, |
|
"learning_rate": 3.3711685823754793e-06, |
|
"loss": 11.6856, |
|
"step": 97400 |
|
}, |
|
{ |
|
"epoch": 74.71264367816092, |
|
"grad_norm": 2.2279295921325684, |
|
"learning_rate": 3.3232758620689653e-06, |
|
"loss": 11.7551, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 74.78927203065135, |
|
"grad_norm": 1.8389006853103638, |
|
"learning_rate": 3.275383141762452e-06, |
|
"loss": 12.0037, |
|
"step": 97600 |
|
}, |
|
{ |
|
"epoch": 74.86590038314176, |
|
"grad_norm": 1.4288936853408813, |
|
"learning_rate": 3.2274904214559387e-06, |
|
"loss": 12.0561, |
|
"step": 97700 |
|
}, |
|
{ |
|
"epoch": 74.94252873563218, |
|
"grad_norm": 1.037800669670105, |
|
"learning_rate": 3.1795977011494255e-06, |
|
"loss": 11.9257, |
|
"step": 97800 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"eval_loss": 12.724896430969238, |
|
"eval_runtime": 44.1538, |
|
"eval_samples_per_second": 29.556, |
|
"eval_steps_per_second": 3.714, |
|
"step": 97875 |
|
}, |
|
{ |
|
"epoch": 75.01915708812261, |
|
"grad_norm": 0.9783554673194885, |
|
"learning_rate": 3.1317049808429124e-06, |
|
"loss": 11.7455, |
|
"step": 97900 |
|
}, |
|
{ |
|
"epoch": 75.09578544061303, |
|
"grad_norm": 1.4434301853179932, |
|
"learning_rate": 3.0838122605363985e-06, |
|
"loss": 11.99, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 75.17241379310344, |
|
"grad_norm": 1.2560200691223145, |
|
"learning_rate": 3.035919540229885e-06, |
|
"loss": 11.8445, |
|
"step": 98100 |
|
}, |
|
{ |
|
"epoch": 75.24904214559388, |
|
"grad_norm": 1.123687982559204, |
|
"learning_rate": 2.988026819923372e-06, |
|
"loss": 11.8894, |
|
"step": 98200 |
|
}, |
|
{ |
|
"epoch": 75.32567049808429, |
|
"grad_norm": 1.2393250465393066, |
|
"learning_rate": 2.9401340996168583e-06, |
|
"loss": 11.7591, |
|
"step": 98300 |
|
}, |
|
{ |
|
"epoch": 75.40229885057471, |
|
"grad_norm": 2.023070812225342, |
|
"learning_rate": 2.892241379310345e-06, |
|
"loss": 11.7083, |
|
"step": 98400 |
|
}, |
|
{ |
|
"epoch": 75.47892720306514, |
|
"grad_norm": 1.7746585607528687, |
|
"learning_rate": 2.8443486590038316e-06, |
|
"loss": 12.0237, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 75.55555555555556, |
|
"grad_norm": 1.6215800046920776, |
|
"learning_rate": 2.796455938697318e-06, |
|
"loss": 11.8271, |
|
"step": 98600 |
|
}, |
|
{ |
|
"epoch": 75.63218390804597, |
|
"grad_norm": 2.3727614879608154, |
|
"learning_rate": 2.7490421455938698e-06, |
|
"loss": 11.9133, |
|
"step": 98700 |
|
}, |
|
{ |
|
"epoch": 75.7088122605364, |
|
"grad_norm": 1.562569260597229, |
|
"learning_rate": 2.7011494252873562e-06, |
|
"loss": 11.8886, |
|
"step": 98800 |
|
}, |
|
{ |
|
"epoch": 75.78544061302682, |
|
"grad_norm": 0.8996521830558777, |
|
"learning_rate": 2.653256704980843e-06, |
|
"loss": 11.6606, |
|
"step": 98900 |
|
}, |
|
{ |
|
"epoch": 75.86206896551724, |
|
"grad_norm": 1.6331411600112915, |
|
"learning_rate": 2.6053639846743296e-06, |
|
"loss": 12.057, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 75.93869731800767, |
|
"grad_norm": 1.2690104246139526, |
|
"learning_rate": 2.5574712643678165e-06, |
|
"loss": 11.9791, |
|
"step": 99100 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_loss": 12.717323303222656, |
|
"eval_runtime": 44.1546, |
|
"eval_samples_per_second": 29.555, |
|
"eval_steps_per_second": 3.714, |
|
"step": 99180 |
|
}, |
|
{ |
|
"epoch": 76.01532567049809, |
|
"grad_norm": 1.737823724746704, |
|
"learning_rate": 2.509578544061303e-06, |
|
"loss": 11.8981, |
|
"step": 99200 |
|
}, |
|
{ |
|
"epoch": 76.0919540229885, |
|
"grad_norm": 1.0878353118896484, |
|
"learning_rate": 2.4616858237547894e-06, |
|
"loss": 11.8443, |
|
"step": 99300 |
|
}, |
|
{ |
|
"epoch": 76.16858237547893, |
|
"grad_norm": 2.0454564094543457, |
|
"learning_rate": 2.413793103448276e-06, |
|
"loss": 11.8515, |
|
"step": 99400 |
|
}, |
|
{ |
|
"epoch": 76.24521072796935, |
|
"grad_norm": 1.3210684061050415, |
|
"learning_rate": 2.3659003831417623e-06, |
|
"loss": 12.0233, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 76.32183908045977, |
|
"grad_norm": 1.1547104120254517, |
|
"learning_rate": 2.318007662835249e-06, |
|
"loss": 11.7145, |
|
"step": 99600 |
|
}, |
|
{ |
|
"epoch": 76.3984674329502, |
|
"grad_norm": 1.3948626518249512, |
|
"learning_rate": 2.270114942528736e-06, |
|
"loss": 11.7098, |
|
"step": 99700 |
|
}, |
|
{ |
|
"epoch": 76.47509578544062, |
|
"grad_norm": 1.2874501943588257, |
|
"learning_rate": 2.2222222222222225e-06, |
|
"loss": 11.8953, |
|
"step": 99800 |
|
}, |
|
{ |
|
"epoch": 76.55172413793103, |
|
"grad_norm": 1.8570905923843384, |
|
"learning_rate": 2.174329501915709e-06, |
|
"loss": 11.9397, |
|
"step": 99900 |
|
}, |
|
{ |
|
"epoch": 76.62835249042146, |
|
"grad_norm": 1.3673057556152344, |
|
"learning_rate": 2.1264367816091954e-06, |
|
"loss": 11.8056, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 76.70498084291188, |
|
"grad_norm": 2.1938419342041016, |
|
"learning_rate": 2.078544061302682e-06, |
|
"loss": 11.9414, |
|
"step": 100100 |
|
}, |
|
{ |
|
"epoch": 76.7816091954023, |
|
"grad_norm": 1.9171061515808105, |
|
"learning_rate": 2.0306513409961687e-06, |
|
"loss": 11.8369, |
|
"step": 100200 |
|
}, |
|
{ |
|
"epoch": 76.85823754789271, |
|
"grad_norm": 1.0486401319503784, |
|
"learning_rate": 1.982758620689655e-06, |
|
"loss": 11.8322, |
|
"step": 100300 |
|
}, |
|
{ |
|
"epoch": 76.93486590038314, |
|
"grad_norm": 1.6005215644836426, |
|
"learning_rate": 1.934865900383142e-06, |
|
"loss": 11.8781, |
|
"step": 100400 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"eval_loss": 12.72097396850586, |
|
"eval_runtime": 44.1751, |
|
"eval_samples_per_second": 29.542, |
|
"eval_steps_per_second": 3.712, |
|
"step": 100485 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 104400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 80, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 10, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 9 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.681650983960218e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|