Upload 10 files

Browse files

Files changed (8) hide show

config.json +2 -2
model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
tokenizer_config.json +1 -1
trainer_state.json +2577 -3
training_args.bin +2 -2

config.json CHANGED Viewed

@@ -14,6 +14,7 @@
   "cls_token_id": 50281,
   "decoder_bias": true,
   "deterministic_flash_attn": false,
   "embedding_dropout": 0.0,
   "eos_token_id": 50282,
   "global_attn_every_n_layers": 3,
@@ -42,7 +43,6 @@
   "sep_token_id": 50282,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
-  "torch_dtype": "float32",
-  "transformers_version": "4.51.3",
   "vocab_size": 50368
 }

   "cls_token_id": 50281,
   "decoder_bias": true,
   "deterministic_flash_attn": false,
+  "dtype": "float32",
   "embedding_dropout": 0.0,
   "eos_token_id": 50282,
   "global_attn_every_n_layers": 3,
   "sep_token_id": 50282,
   "sparse_pred_ignore_index": -100,
   "sparse_prediction": false,
+  "transformers_version": "4.56.1",
   "vocab_size": 50368
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:50accac5965491b2be88f637936f4d01b489952706122fe0d135ecab30a39e80
 size 598635032

 version https://git-lfs.github.com/spec/v1
+oid sha256:4baee20d911bded3ac972714a9c339be4051aac75f3be17c5dd47c3bb0a04e63
 size 598635032

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aab4c8f0590b9324c064c0ed46942e320d9a59713b8f281918dcd2d85799abe0
 size 1197359627

 version https://git-lfs.github.com/spec/v1
+oid sha256:dffb20f09d581f1a8db94110ac7014fac958626dee3c29e960da4cb1c9f38e85
 size 1197359627

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a0c558605c8d82b7652608db5c6f7c38916ecd8a2b4b556a203e1542326fcd7
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:db4c787397c7bd17a5fb6bef85caf0ed539cdf41b0fe201e17b766ab049c2a38
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a4e558df6977b3affd18572e35774f9ec32c672991e5e862c77249ff31a4e9ad
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:528826677c74cc85ac3103f0a7ddc5d791ae235096533cc53310113c112a4947
 size 1465

tokenizer_config.json CHANGED Viewed

@@ -940,6 +940,6 @@
   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "tokenizer_class": "PreTrainedTokenizer",
   "unk_token": "[UNK]"
 }

   "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
+  "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "[UNK]"
 }

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0027908170954292,
   "eval_steps": 1000,
-  "global_step": 134000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -10467,6 +10467,2580 @@
       "eval_samples_per_second": 196.558,
       "eval_steps_per_second": 1.543,
       "step": 134000
     }
   ],
   "logging_steps": 100,
@@ -10486,7 +13060,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.1694502947323904e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.0195357196680044,
   "eval_steps": 1000,
+  "global_step": 167000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 196.558,
       "eval_steps_per_second": 1.543,
       "step": 134000
+    },
+    {
+      "epoch": 0.00306989880497212,
+      "grad_norm": 2.9615020751953125,
+      "learning_rate": 3.528037405343427e-05,
+      "loss": 3.6576,
+      "step": 134100
+    },
+    {
+      "epoch": 0.0033489805145150396,
+      "grad_norm": 3.0420079231262207,
+      "learning_rate": 3.52601881252502e-05,
+      "loss": 3.6607,
+      "step": 134200
+    },
+    {
+      "epoch": 0.0036280622240579597,
+      "grad_norm": 2.960287094116211,
+      "learning_rate": 3.5239994149780645e-05,
+      "loss": 3.6668,
+      "step": 134300
+    },
+    {
+      "epoch": 0.00390714393360088,
+      "grad_norm": 3.181061267852783,
+      "learning_rate": 3.521979214286417e-05,
+      "loss": 3.6564,
+      "step": 134400
+    },
+    {
+      "epoch": 0.0041862256431437995,
+      "grad_norm": 2.98903489112854,
+      "learning_rate": 3.519958212034564e-05,
+      "loss": 3.6662,
+      "step": 134500
+    },
+    {
+      "epoch": 0.004465307352686719,
+      "grad_norm": 3.0481224060058594,
+      "learning_rate": 3.5179364098076216e-05,
+      "loss": 3.5675,
+      "step": 134600
+    },
+    {
+      "epoch": 0.00474438906222964,
+      "grad_norm": 3.1568892002105713,
+      "learning_rate": 3.5159138091913325e-05,
+      "loss": 3.6681,
+      "step": 134700
+    },
+    {
+      "epoch": 0.005023470771772559,
+      "grad_norm": 3.0683400630950928,
+      "learning_rate": 3.5138904117720653e-05,
+      "loss": 3.6584,
+      "step": 134800
+    },
+    {
+      "epoch": 0.00530255248131548,
+      "grad_norm": 3.077857494354248,
+      "learning_rate": 3.511866219136814e-05,
+      "loss": 3.6734,
+      "step": 134900
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "grad_norm": 3.012407064437866,
+      "learning_rate": 3.509841232873195e-05,
+      "loss": 3.6649,
+      "step": 135000
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "eval_loss": 2.210810661315918,
+      "eval_runtime": 51.4646,
+      "eval_samples_per_second": 198.078,
+      "eval_steps_per_second": 1.554,
+      "step": 135000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 3.164175033569336,
+      "learning_rate": 3.507815454569451e-05,
+      "loss": 3.6716,
+      "step": 135100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 3.1793689727783203,
+      "learning_rate": 3.5057888858144416e-05,
+      "loss": 3.643,
+      "step": 135200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 3.0864334106445312,
+      "learning_rate": 3.5037615281976495e-05,
+      "loss": 3.6401,
+      "step": 135300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 3.1052911281585693,
+      "learning_rate": 3.501733383309174e-05,
+      "loss": 3.6583,
+      "step": 135400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 3.1001381874084473,
+      "learning_rate": 3.499704452739732e-05,
+      "loss": 3.6582,
+      "step": 135500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 3.0288331508636475,
+      "learning_rate": 3.4976747380806574e-05,
+      "loss": 3.6652,
+      "step": 135600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 2.998553991317749,
+      "learning_rate": 3.4956442409238986e-05,
+      "loss": 3.6602,
+      "step": 135700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 3.052278757095337,
+      "learning_rate": 3.49361296286202e-05,
+      "loss": 3.6572,
+      "step": 135800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 2.945789098739624,
+      "learning_rate": 3.491580905488195e-05,
+      "loss": 3.643,
+      "step": 135900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 2.888101577758789,
+      "learning_rate": 3.48954807039621e-05,
+      "loss": 3.6616,
+      "step": 136000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.203134298324585,
+      "eval_runtime": 51.3238,
+      "eval_samples_per_second": 198.621,
+      "eval_steps_per_second": 1.559,
+      "step": 136000
+    },
+    {
+      "epoch": 0.008651532995830519,
+      "grad_norm": 3.194450855255127,
+      "learning_rate": 3.487514459180461e-05,
+      "loss": 3.6524,
+      "step": 136100
+    },
+    {
+      "epoch": 0.008930614705373438,
+      "grad_norm": 2.904616355895996,
+      "learning_rate": 3.485480073435953e-05,
+      "loss": 3.6361,
+      "step": 136200
+    },
+    {
+      "epoch": 0.00920969641491636,
+      "grad_norm": 3.003732919692993,
+      "learning_rate": 3.483444914758298e-05,
+      "loss": 3.6364,
+      "step": 136300
+    },
+    {
+      "epoch": 0.00948877812445928,
+      "grad_norm": 3.1838271617889404,
+      "learning_rate": 3.481408984743716e-05,
+      "loss": 3.6386,
+      "step": 136400
+    },
+    {
+      "epoch": 0.0097678598340022,
+      "grad_norm": 3.0291504859924316,
+      "learning_rate": 3.479372284989028e-05,
+      "loss": 3.6795,
+      "step": 136500
+    },
+    {
+      "epoch": 0.010046941543545119,
+      "grad_norm": 3.087804079055786,
+      "learning_rate": 3.477334817091664e-05,
+      "loss": 3.6999,
+      "step": 136600
+    },
+    {
+      "epoch": 0.010326023253088039,
+      "grad_norm": 3.039384365081787,
+      "learning_rate": 3.475296582649652e-05,
+      "loss": 3.7043,
+      "step": 136700
+    },
+    {
+      "epoch": 0.01060510496263096,
+      "grad_norm": 3.3074495792388916,
+      "learning_rate": 3.4732575832616235e-05,
+      "loss": 3.6944,
+      "step": 136800
+    },
+    {
+      "epoch": 0.01088418667217388,
+      "grad_norm": 2.9702234268188477,
+      "learning_rate": 3.471217820526808e-05,
+      "loss": 3.7179,
+      "step": 136900
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "grad_norm": 3.0739312171936035,
+      "learning_rate": 3.469177296045039e-05,
+      "loss": 3.706,
+      "step": 137000
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "eval_loss": 2.2038073539733887,
+      "eval_runtime": 51.4208,
+      "eval_samples_per_second": 198.247,
+      "eval_steps_per_second": 1.556,
+      "step": 137000
+    },
+    {
+      "epoch": 0.011442350091259719,
+      "grad_norm": 3.1482625007629395,
+      "learning_rate": 3.4671360114167395e-05,
+      "loss": 3.6934,
+      "step": 137100
+    },
+    {
+      "epoch": 0.011721431800802639,
+      "grad_norm": 3.0761804580688477,
+      "learning_rate": 3.465093968242935e-05,
+      "loss": 3.7073,
+      "step": 137200
+    },
+    {
+      "epoch": 0.012000513510345558,
+      "grad_norm": 3.0399630069732666,
+      "learning_rate": 3.463051168125243e-05,
+      "loss": 3.6919,
+      "step": 137300
+    },
+    {
+      "epoch": 0.01227959521988848,
+      "grad_norm": 2.898616075515747,
+      "learning_rate": 3.4610076126658765e-05,
+      "loss": 3.4094,
+      "step": 137400
+    },
+    {
+      "epoch": 0.0125586769294314,
+      "grad_norm": 3.054553985595703,
+      "learning_rate": 3.458963303467638e-05,
+      "loss": 3.709,
+      "step": 137500
+    },
+    {
+      "epoch": 0.012837758638974319,
+      "grad_norm": 3.0647337436676025,
+      "learning_rate": 3.456918242133924e-05,
+      "loss": 3.6935,
+      "step": 137600
+    },
+    {
+      "epoch": 0.013116840348517239,
+      "grad_norm": 2.9094271659851074,
+      "learning_rate": 3.45487243026872e-05,
+      "loss": 3.709,
+      "step": 137700
+    },
+    {
+      "epoch": 0.013395922058060158,
+      "grad_norm": 3.0624611377716064,
+      "learning_rate": 3.4528258694766e-05,
+      "loss": 3.7097,
+      "step": 137800
+    },
+    {
+      "epoch": 0.013675003767603078,
+      "grad_norm": 3.080920696258545,
+      "learning_rate": 3.4507785613627246e-05,
+      "loss": 3.7166,
+      "step": 137900
+    },
+    {
+      "epoch": 0.013954085477146,
+      "grad_norm": 3.08603835105896,
+      "learning_rate": 3.4487305075328434e-05,
+      "loss": 3.6971,
+      "step": 138000
+    },
+    {
+      "epoch": 0.013954085477146,
+      "eval_loss": 2.199204683303833,
+      "eval_runtime": 51.4121,
+      "eval_samples_per_second": 198.28,
+      "eval_steps_per_second": 1.556,
+      "step": 138000
+    },
+    {
+      "epoch": 0.01423316718668892,
+      "grad_norm": 2.937602996826172,
+      "learning_rate": 3.446681709593288e-05,
+      "loss": 3.6892,
+      "step": 138100
+    },
+    {
+      "epoch": 0.014512248896231839,
+      "grad_norm": 3.0739290714263916,
+      "learning_rate": 3.444632169150974e-05,
+      "loss": 3.6923,
+      "step": 138200
+    },
+    {
+      "epoch": 0.014791330605774759,
+      "grad_norm": 3.076711654663086,
+      "learning_rate": 3.4425818878134006e-05,
+      "loss": 3.6838,
+      "step": 138300
+    },
+    {
+      "epoch": 0.015070412315317678,
+      "grad_norm": 3.1818246841430664,
+      "learning_rate": 3.4405308671886465e-05,
+      "loss": 3.7162,
+      "step": 138400
+    },
+    {
+      "epoch": 0.015349494024860598,
+      "grad_norm": 3.0970702171325684,
+      "learning_rate": 3.438479108885372e-05,
+      "loss": 3.6906,
+      "step": 138500
+    },
+    {
+      "epoch": 0.01562857573440352,
+      "grad_norm": 2.924048662185669,
+      "learning_rate": 3.436426614512815e-05,
+      "loss": 3.688,
+      "step": 138600
+    },
+    {
+      "epoch": 0.015907657443946437,
+      "grad_norm": 3.151340961456299,
+      "learning_rate": 3.434373385680791e-05,
+      "loss": 3.6952,
+      "step": 138700
+    },
+    {
+      "epoch": 0.01618673915348936,
+      "grad_norm": 3.1132941246032715,
+      "learning_rate": 3.4323194239996906e-05,
+      "loss": 3.6774,
+      "step": 138800
+    },
+    {
+      "epoch": 0.01646582086303228,
+      "grad_norm": 3.077354907989502,
+      "learning_rate": 3.43026473108048e-05,
+      "loss": 3.7007,
+      "step": 138900
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "grad_norm": 3.079162120819092,
+      "learning_rate": 3.4282093085347e-05,
+      "loss": 3.6982,
+      "step": 139000
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "eval_loss": 2.199798107147217,
+      "eval_runtime": 51.6229,
+      "eval_samples_per_second": 197.47,
+      "eval_steps_per_second": 1.55,
+      "step": 139000
+    },
+    {
+      "epoch": 0.01702398428211812,
+      "grad_norm": 3.0697269439697266,
+      "learning_rate": 3.426153157974462e-05,
+      "loss": 3.6903,
+      "step": 139100
+    },
+    {
+      "epoch": 0.017303065991661037,
+      "grad_norm": 2.9596915245056152,
+      "learning_rate": 3.4240962810124485e-05,
+      "loss": 3.6961,
+      "step": 139200
+    },
+    {
+      "epoch": 0.01758214770120396,
+      "grad_norm": 3.1975579261779785,
+      "learning_rate": 3.4220386792619134e-05,
+      "loss": 3.6893,
+      "step": 139300
+    },
+    {
+      "epoch": 0.017861229410746877,
+      "grad_norm": 3.0534658432006836,
+      "learning_rate": 3.419980354336677e-05,
+      "loss": 3.6867,
+      "step": 139400
+    },
+    {
+      "epoch": 0.018140311120289798,
+      "grad_norm": 3.014533519744873,
+      "learning_rate": 3.4179213078511276e-05,
+      "loss": 3.6807,
+      "step": 139500
+    },
+    {
+      "epoch": 0.01841939282983272,
+      "grad_norm": 3.1853954792022705,
+      "learning_rate": 3.415861541420219e-05,
+      "loss": 3.6836,
+      "step": 139600
+    },
+    {
+      "epoch": 0.018698474539375638,
+      "grad_norm": 2.994035482406616,
+      "learning_rate": 3.413801056659471e-05,
+      "loss": 3.6843,
+      "step": 139700
+    },
+    {
+      "epoch": 0.01897755624891856,
+      "grad_norm": 3.01277232170105,
+      "learning_rate": 3.411739855184966e-05,
+      "loss": 3.6875,
+      "step": 139800
+    },
+    {
+      "epoch": 0.019256637958461477,
+      "grad_norm": 2.956777334213257,
+      "learning_rate": 3.409677938613348e-05,
+      "loss": 3.6708,
+      "step": 139900
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "grad_norm": 3.136683225631714,
+      "learning_rate": 3.407615308561822e-05,
+      "loss": 3.6853,
+      "step": 140000
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "eval_loss": 2.1988320350646973,
+      "eval_runtime": 51.6562,
+      "eval_samples_per_second": 197.343,
+      "eval_steps_per_second": 1.549,
+      "step": 140000
+    },
+    {
+      "epoch": 0.01981480137754732,
+      "grad_norm": 3.045262575149536,
+      "learning_rate": 3.405551966648155e-05,
+      "loss": 3.6856,
+      "step": 140100
+    },
+    {
+      "epoch": 0.020093883087090238,
+      "grad_norm": 3.14791202545166,
+      "learning_rate": 3.4034879144906674e-05,
+      "loss": 3.6802,
+      "step": 140200
+    },
+    {
+      "epoch": 0.02037296479663316,
+      "grad_norm": 2.9139297008514404,
+      "learning_rate": 3.401423153708242e-05,
+      "loss": 3.6717,
+      "step": 140300
+    },
+    {
+      "epoch": 0.020652046506176077,
+      "grad_norm": 3.0042569637298584,
+      "learning_rate": 3.399357685920314e-05,
+      "loss": 3.6752,
+      "step": 140400
+    },
+    {
+      "epoch": 0.020931128215719,
+      "grad_norm": 3.045513153076172,
+      "learning_rate": 3.397291512746873e-05,
+      "loss": 3.6921,
+      "step": 140500
+    },
+    {
+      "epoch": 0.02121020992526192,
+      "grad_norm": 3.0931684970855713,
+      "learning_rate": 3.3952246358084645e-05,
+      "loss": 3.6733,
+      "step": 140600
+    },
+    {
+      "epoch": 0.021489291634804838,
+      "grad_norm": 3.1561226844787598,
+      "learning_rate": 3.393157056726184e-05,
+      "loss": 3.6702,
+      "step": 140700
+    },
+    {
+      "epoch": 0.02176837334434776,
+      "grad_norm": 3.267413854598999,
+      "learning_rate": 3.391088777121678e-05,
+      "loss": 3.6848,
+      "step": 140800
+    },
+    {
+      "epoch": 0.022047455053890677,
+      "grad_norm": 2.9542646408081055,
+      "learning_rate": 3.3890197986171426e-05,
+      "loss": 3.668,
+      "step": 140900
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "grad_norm": 3.1285250186920166,
+      "learning_rate": 3.386950122835321e-05,
+      "loss": 3.6633,
+      "step": 141000
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "eval_loss": 2.194960355758667,
+      "eval_runtime": 51.7975,
+      "eval_samples_per_second": 196.805,
+      "eval_steps_per_second": 1.544,
+      "step": 141000
+    },
+    {
+      "epoch": 0.022605618472976517,
+      "grad_norm": 3.117668628692627,
+      "learning_rate": 3.3848797513995054e-05,
+      "loss": 3.6846,
+      "step": 141100
+    },
+    {
+      "epoch": 0.022884700182519438,
+      "grad_norm": 3.2943692207336426,
+      "learning_rate": 3.3828086859335326e-05,
+      "loss": 3.6798,
+      "step": 141200
+    },
+    {
+      "epoch": 0.02316378189206236,
+      "grad_norm": 3.078214645385742,
+      "learning_rate": 3.3807369280617834e-05,
+      "loss": 3.6393,
+      "step": 141300
+    },
+    {
+      "epoch": 0.023442863601605277,
+      "grad_norm": 2.948484182357788,
+      "learning_rate": 3.3786644794091816e-05,
+      "loss": 3.6748,
+      "step": 141400
+    },
+    {
+      "epoch": 0.0237219453111482,
+      "grad_norm": 3.035909652709961,
+      "learning_rate": 3.3765913416011935e-05,
+      "loss": 3.6745,
+      "step": 141500
+    },
+    {
+      "epoch": 0.024001027020691117,
+      "grad_norm": 2.9795923233032227,
+      "learning_rate": 3.374517516263824e-05,
+      "loss": 3.6788,
+      "step": 141600
+    },
+    {
+      "epoch": 0.024280108730234038,
+      "grad_norm": 3.133145809173584,
+      "learning_rate": 3.372443005023622e-05,
+      "loss": 3.6672,
+      "step": 141700
+    },
+    {
+      "epoch": 0.02455919043977696,
+      "grad_norm": 3.067864179611206,
+      "learning_rate": 3.370367809507668e-05,
+      "loss": 3.6433,
+      "step": 141800
+    },
+    {
+      "epoch": 0.024838272149319877,
+      "grad_norm": 2.9287595748901367,
+      "learning_rate": 3.3682919313435836e-05,
+      "loss": 3.6574,
+      "step": 141900
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "grad_norm": 2.9526288509368896,
+      "learning_rate": 3.3662153721595244e-05,
+      "loss": 3.658,
+      "step": 142000
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "eval_loss": 2.188488483428955,
+      "eval_runtime": 51.6355,
+      "eval_samples_per_second": 197.422,
+      "eval_steps_per_second": 1.549,
+      "step": 142000
+    },
+    {
+      "epoch": 0.025396435568405717,
+      "grad_norm": 3.129251718521118,
+      "learning_rate": 3.36413813358418e-05,
+      "loss": 3.6714,
+      "step": 142100
+    },
+    {
+      "epoch": 0.025675517277948638,
+      "grad_norm": 3.0163700580596924,
+      "learning_rate": 3.362060217246775e-05,
+      "loss": 3.662,
+      "step": 142200
+    },
+    {
+      "epoch": 0.025954598987491556,
+      "grad_norm": 3.160065174102783,
+      "learning_rate": 3.359981624777061e-05,
+      "loss": 3.6398,
+      "step": 142300
+    },
+    {
+      "epoch": 0.026233680697034478,
+      "grad_norm": 3.0241780281066895,
+      "learning_rate": 3.3579023578053245e-05,
+      "loss": 3.6516,
+      "step": 142400
+    },
+    {
+      "epoch": 0.0265127624065774,
+      "grad_norm": 2.963665008544922,
+      "learning_rate": 3.355822417962378e-05,
+      "loss": 3.6691,
+      "step": 142500
+    },
+    {
+      "epoch": 0.026791844116120317,
+      "grad_norm": 3.1197776794433594,
+      "learning_rate": 3.3537418068795634e-05,
+      "loss": 3.6647,
+      "step": 142600
+    },
+    {
+      "epoch": 0.02707092582566324,
+      "grad_norm": 3.207745313644409,
+      "learning_rate": 3.3516605261887494e-05,
+      "loss": 3.6587,
+      "step": 142700
+    },
+    {
+      "epoch": 0.027350007535206156,
+      "grad_norm": 3.1258158683776855,
+      "learning_rate": 3.3495785775223274e-05,
+      "loss": 3.6582,
+      "step": 142800
+    },
+    {
+      "epoch": 0.027629089244749078,
+      "grad_norm": 3.1733665466308594,
+      "learning_rate": 3.347495962513215e-05,
+      "loss": 3.6611,
+      "step": 142900
+    },
+    {
+      "epoch": 0.027908170954292,
+      "grad_norm": 3.0605976581573486,
+      "learning_rate": 3.345412682794853e-05,
+      "loss": 3.6533,
+      "step": 143000
+    },
+    {
+      "epoch": 0.027908170954292,
+      "eval_loss": 2.192350149154663,
+      "eval_runtime": 51.7936,
+      "eval_samples_per_second": 196.82,
+      "eval_steps_per_second": 1.545,
+      "step": 143000
+    },
+    {
+      "epoch": 0.028187252663834917,
+      "grad_norm": 3.114699363708496,
+      "learning_rate": 3.3433287400012e-05,
+      "loss": 3.637,
+      "step": 143100
+    },
+    {
+      "epoch": 0.02846633437337784,
+      "grad_norm": 3.0836739540100098,
+      "learning_rate": 3.34124413576674e-05,
+      "loss": 3.6546,
+      "step": 143200
+    },
+    {
+      "epoch": 0.028745416082920756,
+      "grad_norm": 3.009408950805664,
+      "learning_rate": 3.33915887172647e-05,
+      "loss": 3.6605,
+      "step": 143300
+    },
+    {
+      "epoch": 0.029024497792463678,
+      "grad_norm": 3.173821210861206,
+      "learning_rate": 3.337072949515909e-05,
+      "loss": 3.6607,
+      "step": 143400
+    },
+    {
+      "epoch": 0.0293035795020066,
+      "grad_norm": 3.0830142498016357,
+      "learning_rate": 3.334986370771089e-05,
+      "loss": 3.6414,
+      "step": 143500
+    },
+    {
+      "epoch": 0.029582661211549517,
+      "grad_norm": 3.06382155418396,
+      "learning_rate": 3.3328991371285604e-05,
+      "loss": 3.6384,
+      "step": 143600
+    },
+    {
+      "epoch": 0.02986174292109244,
+      "grad_norm": 3.039879083633423,
+      "learning_rate": 3.3308112502253844e-05,
+      "loss": 3.6414,
+      "step": 143700
+    },
+    {
+      "epoch": 0.030140824630635357,
+      "grad_norm": 3.145960807800293,
+      "learning_rate": 3.3287227116991346e-05,
+      "loss": 3.6554,
+      "step": 143800
+    },
+    {
+      "epoch": 0.030419906340178278,
+      "grad_norm": 2.9724478721618652,
+      "learning_rate": 3.326633523187897e-05,
+      "loss": 3.6537,
+      "step": 143900
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "grad_norm": 3.0227105617523193,
+      "learning_rate": 3.324543686330268e-05,
+      "loss": 3.6496,
+      "step": 144000
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "eval_loss": 2.199296236038208,
+      "eval_runtime": 51.7247,
+      "eval_samples_per_second": 197.082,
+      "eval_steps_per_second": 1.547,
+      "step": 144000
+    },
+    {
+      "epoch": 0.030978069759264117,
+      "grad_norm": 3.1452724933624268,
+      "learning_rate": 3.3224532027653506e-05,
+      "loss": 3.6534,
+      "step": 144100
+    },
+    {
+      "epoch": 0.03125715146880704,
+      "grad_norm": 3.0349013805389404,
+      "learning_rate": 3.3203620741327555e-05,
+      "loss": 3.6355,
+      "step": 144200
+    },
+    {
+      "epoch": 0.03153623317834996,
+      "grad_norm": 2.9786078929901123,
+      "learning_rate": 3.3182703020726e-05,
+      "loss": 3.6582,
+      "step": 144300
+    },
+    {
+      "epoch": 0.031815314887892875,
+      "grad_norm": 3.17039155960083,
+      "learning_rate": 3.316177888225506e-05,
+      "loss": 3.6421,
+      "step": 144400
+    },
+    {
+      "epoch": 0.0320943965974358,
+      "grad_norm": 2.8847384452819824,
+      "learning_rate": 3.3140848342325985e-05,
+      "loss": 3.6547,
+      "step": 144500
+    },
+    {
+      "epoch": 0.03237347830697872,
+      "grad_norm": 2.9963126182556152,
+      "learning_rate": 3.3119911417355045e-05,
+      "loss": 3.6473,
+      "step": 144600
+    },
+    {
+      "epoch": 0.032652560016521635,
+      "grad_norm": 3.042747974395752,
+      "learning_rate": 3.309896812376353e-05,
+      "loss": 3.6501,
+      "step": 144700
+    },
+    {
+      "epoch": 0.03293164172606456,
+      "grad_norm": 3.0630815029144287,
+      "learning_rate": 3.307801847797769e-05,
+      "loss": 3.6571,
+      "step": 144800
+    },
+    {
+      "epoch": 0.03321072343560748,
+      "grad_norm": 3.174445390701294,
+      "learning_rate": 3.30570624964288e-05,
+      "loss": 3.6574,
+      "step": 144900
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "grad_norm": 3.1276638507843018,
+      "learning_rate": 3.3036100195553074e-05,
+      "loss": 3.6543,
+      "step": 145000
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "eval_loss": 2.1886751651763916,
+      "eval_runtime": 51.7134,
+      "eval_samples_per_second": 197.125,
+      "eval_steps_per_second": 1.547,
+      "step": 145000
+    },
+    {
+      "epoch": 0.033768886854693314,
+      "grad_norm": 2.9898717403411865,
+      "learning_rate": 3.3015131591791705e-05,
+      "loss": 3.6664,
+      "step": 145100
+    },
+    {
+      "epoch": 0.03404796856423624,
+      "grad_norm": 3.03507137298584,
+      "learning_rate": 3.2994156701590813e-05,
+      "loss": 3.6707,
+      "step": 145200
+    },
+    {
+      "epoch": 0.03432705027377916,
+      "grad_norm": 2.995556354522705,
+      "learning_rate": 3.297317554140146e-05,
+      "loss": 3.6656,
+      "step": 145300
+    },
+    {
+      "epoch": 0.034606131983322075,
+      "grad_norm": 3.1159939765930176,
+      "learning_rate": 3.295218812767961e-05,
+      "loss": 3.6558,
+      "step": 145400
+    },
+    {
+      "epoch": 0.034885213692865,
+      "grad_norm": 2.9724996089935303,
+      "learning_rate": 3.293119447688615e-05,
+      "loss": 3.6455,
+      "step": 145500
+    },
+    {
+      "epoch": 0.03516429540240792,
+      "grad_norm": 3.123499631881714,
+      "learning_rate": 3.291019460548684e-05,
+      "loss": 3.6437,
+      "step": 145600
+    },
+    {
+      "epoch": 0.035443377111950836,
+      "grad_norm": 3.0609242916107178,
+      "learning_rate": 3.2889188529952334e-05,
+      "loss": 3.6567,
+      "step": 145700
+    },
+    {
+      "epoch": 0.035722458821493754,
+      "grad_norm": 3.1065030097961426,
+      "learning_rate": 3.286817626675815e-05,
+      "loss": 3.6503,
+      "step": 145800
+    },
+    {
+      "epoch": 0.03600154053103668,
+      "grad_norm": 3.0567867755889893,
+      "learning_rate": 3.284715783238466e-05,
+      "loss": 3.6493,
+      "step": 145900
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "grad_norm": 2.944715738296509,
+      "learning_rate": 3.282613324331707e-05,
+      "loss": 3.663,
+      "step": 146000
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "eval_loss": 2.181875467300415,
+      "eval_runtime": 51.9227,
+      "eval_samples_per_second": 196.33,
+      "eval_steps_per_second": 1.541,
+      "step": 146000
+    },
+    {
+      "epoch": 0.00027908170954291995,
+      "grad_norm": 3.1367740631103516,
+      "learning_rate": 3.280510251604541e-05,
+      "loss": 3.6419,
+      "step": 146100
+    },
+    {
+      "epoch": 0.0005581634190858399,
+      "grad_norm": 3.077601671218872,
+      "learning_rate": 3.2784065667064536e-05,
+      "loss": 3.661,
+      "step": 146200
+    },
+    {
+      "epoch": 0.0008372451286287599,
+      "grad_norm": 3.0808331966400146,
+      "learning_rate": 3.2763022712874094e-05,
+      "loss": 3.6409,
+      "step": 146300
+    },
+    {
+      "epoch": 0.0011163268381716798,
+      "grad_norm": 2.791093111038208,
+      "learning_rate": 3.274197366997852e-05,
+      "loss": 3.6515,
+      "step": 146400
+    },
+    {
+      "epoch": 0.0013954085477146,
+      "grad_norm": 3.005890369415283,
+      "learning_rate": 3.272091855488705e-05,
+      "loss": 3.6402,
+      "step": 146500
+    },
+    {
+      "epoch": 0.0016744902572575198,
+      "grad_norm": 3.0411083698272705,
+      "learning_rate": 3.2699857384113644e-05,
+      "loss": 3.6484,
+      "step": 146600
+    },
+    {
+      "epoch": 0.00195357196680044,
+      "grad_norm": 2.9706947803497314,
+      "learning_rate": 3.267879017417705e-05,
+      "loss": 3.6431,
+      "step": 146700
+    },
+    {
+      "epoch": 0.0022326536763433596,
+      "grad_norm": 2.8929619789123535,
+      "learning_rate": 3.2657716941600694e-05,
+      "loss": 3.6325,
+      "step": 146800
+    },
+    {
+      "epoch": 0.0025117353858862797,
+      "grad_norm": 3.0911691188812256,
+      "learning_rate": 3.2636637702912805e-05,
+      "loss": 3.6321,
+      "step": 146900
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "grad_norm": 3.0026369094848633,
+      "learning_rate": 3.261555247464626e-05,
+      "loss": 3.6279,
+      "step": 147000
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "eval_loss": 2.1829118728637695,
+      "eval_runtime": 52.5212,
+      "eval_samples_per_second": 194.093,
+      "eval_steps_per_second": 1.523,
+      "step": 147000
+    },
+    {
+      "epoch": 0.00306989880497212,
+      "grad_norm": 3.0439035892486572,
+      "learning_rate": 3.259446127333865e-05,
+      "loss": 3.6467,
+      "step": 147100
+    },
+    {
+      "epoch": 0.0033489805145150396,
+      "grad_norm": 2.952643871307373,
+      "learning_rate": 3.2573364115532276e-05,
+      "loss": 3.6524,
+      "step": 147200
+    },
+    {
+      "epoch": 0.0036280622240579597,
+      "grad_norm": 3.039597988128662,
+      "learning_rate": 3.2552261017774075e-05,
+      "loss": 3.6339,
+      "step": 147300
+    },
+    {
+      "epoch": 0.00390714393360088,
+      "grad_norm": 3.1887271404266357,
+      "learning_rate": 3.253115199661567e-05,
+      "loss": 3.6367,
+      "step": 147400
+    },
+    {
+      "epoch": 0.0041862256431437995,
+      "grad_norm": 3.123321056365967,
+      "learning_rate": 3.2510037068613314e-05,
+      "loss": 3.6283,
+      "step": 147500
+    },
+    {
+      "epoch": 0.004465307352686719,
+      "grad_norm": 3.1954147815704346,
+      "learning_rate": 3.248891625032789e-05,
+      "loss": 3.6295,
+      "step": 147600
+    },
+    {
+      "epoch": 0.00474438906222964,
+      "grad_norm": 3.2092411518096924,
+      "learning_rate": 3.246778955832493e-05,
+      "loss": 3.6417,
+      "step": 147700
+    },
+    {
+      "epoch": 0.005023470771772559,
+      "grad_norm": 3.2568812370300293,
+      "learning_rate": 3.2446657009174523e-05,
+      "loss": 3.6327,
+      "step": 147800
+    },
+    {
+      "epoch": 0.00530255248131548,
+      "grad_norm": 3.068138837814331,
+      "learning_rate": 3.242551861945141e-05,
+      "loss": 3.6543,
+      "step": 147900
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "grad_norm": 3.317512273788452,
+      "learning_rate": 3.240437440573485e-05,
+      "loss": 3.6408,
+      "step": 148000
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "eval_loss": 2.1870715618133545,
+      "eval_runtime": 52.1184,
+      "eval_samples_per_second": 195.593,
+      "eval_steps_per_second": 1.535,
+      "step": 148000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 3.3598294258117676,
+      "learning_rate": 3.238322438460874e-05,
+      "loss": 3.6164,
+      "step": 148100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 3.139274835586548,
+      "learning_rate": 3.2362068572661465e-05,
+      "loss": 3.6436,
+      "step": 148200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 3.0762457847595215,
+      "learning_rate": 3.234090698648599e-05,
+      "loss": 3.6247,
+      "step": 148300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 3.061337947845459,
+      "learning_rate": 3.2319739642679806e-05,
+      "loss": 3.623,
+      "step": 148400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 2.983355760574341,
+      "learning_rate": 3.229856655784491e-05,
+      "loss": 3.6257,
+      "step": 148500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 3.085252523422241,
+      "learning_rate": 3.227738774858782e-05,
+      "loss": 3.6421,
+      "step": 148600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 3.194308042526245,
+      "learning_rate": 3.225620323151951e-05,
+      "loss": 3.6212,
+      "step": 148700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 2.822134494781494,
+      "learning_rate": 3.223501302325546e-05,
+      "loss": 3.6332,
+      "step": 148800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 3.303119421005249,
+      "learning_rate": 3.2213817140415606e-05,
+      "loss": 3.6295,
+      "step": 148900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 3.2773683071136475,
+      "learning_rate": 3.219261559962433e-05,
+      "loss": 3.637,
+      "step": 149000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.179696798324585,
+      "eval_runtime": 52.2402,
+      "eval_samples_per_second": 195.137,
+      "eval_steps_per_second": 1.531,
+      "step": 149000
+    },
+    {
+      "epoch": 0.008651532995830519,
+      "grad_norm": 3.0133464336395264,
+      "learning_rate": 3.217140841751045e-05,
+      "loss": 3.6203,
+      "step": 149100
+    },
+    {
+      "epoch": 0.008930614705373438,
+      "grad_norm": 2.9925966262817383,
+      "learning_rate": 3.215019561070723e-05,
+      "loss": 3.6204,
+      "step": 149200
+    },
+    {
+      "epoch": 0.00920969641491636,
+      "grad_norm": 3.0842456817626953,
+      "learning_rate": 3.2128977195852314e-05,
+      "loss": 3.6303,
+      "step": 149300
+    },
+    {
+      "epoch": 0.00948877812445928,
+      "grad_norm": 3.073462724685669,
+      "learning_rate": 3.210775318958776e-05,
+      "loss": 3.6235,
+      "step": 149400
+    },
+    {
+      "epoch": 0.0097678598340022,
+      "grad_norm": 3.0209946632385254,
+      "learning_rate": 3.208652360856002e-05,
+      "loss": 3.6212,
+      "step": 149500
+    },
+    {
+      "epoch": 0.010046941543545119,
+      "grad_norm": 3.250084161758423,
+      "learning_rate": 3.2065288469419906e-05,
+      "loss": 3.6139,
+      "step": 149600
+    },
+    {
+      "epoch": 0.010326023253088039,
+      "grad_norm": 3.0430448055267334,
+      "learning_rate": 3.204404778882258e-05,
+      "loss": 3.6206,
+      "step": 149700
+    },
+    {
+      "epoch": 0.01060510496263096,
+      "grad_norm": 3.081878662109375,
+      "learning_rate": 3.20228015834276e-05,
+      "loss": 3.6167,
+      "step": 149800
+    },
+    {
+      "epoch": 0.01088418667217388,
+      "grad_norm": 3.110133171081543,
+      "learning_rate": 3.2001549869898774e-05,
+      "loss": 3.627,
+      "step": 149900
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "grad_norm": 3.1069679260253906,
+      "learning_rate": 3.198029266490431e-05,
+      "loss": 3.6122,
+      "step": 150000
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "eval_loss": 2.1840949058532715,
+      "eval_runtime": 52.0857,
+      "eval_samples_per_second": 195.716,
+      "eval_steps_per_second": 1.536,
+      "step": 150000
+    },
+    {
+      "epoch": 0.011442350091259719,
+      "grad_norm": 3.110675096511841,
+      "learning_rate": 3.195902998511666e-05,
+      "loss": 3.6101,
+      "step": 150100
+    },
+    {
+      "epoch": 0.011721431800802639,
+      "grad_norm": 3.100144386291504,
+      "learning_rate": 3.193776184721263e-05,
+      "loss": 3.6098,
+      "step": 150200
+    },
+    {
+      "epoch": 0.012000513510345558,
+      "grad_norm": 3.0613949298858643,
+      "learning_rate": 3.191648826787326e-05,
+      "loss": 3.5987,
+      "step": 150300
+    },
+    {
+      "epoch": 0.01227959521988848,
+      "grad_norm": 2.962594747543335,
+      "learning_rate": 3.189520926378388e-05,
+      "loss": 3.6353,
+      "step": 150400
+    },
+    {
+      "epoch": 0.0125586769294314,
+      "grad_norm": 3.2426040172576904,
+      "learning_rate": 3.187392485163406e-05,
+      "loss": 3.6268,
+      "step": 150500
+    },
+    {
+      "epoch": 0.012837758638974319,
+      "grad_norm": 3.0770397186279297,
+      "learning_rate": 3.1852635048117634e-05,
+      "loss": 3.6132,
+      "step": 150600
+    },
+    {
+      "epoch": 0.013116840348517239,
+      "grad_norm": 3.1057562828063965,
+      "learning_rate": 3.183133986993265e-05,
+      "loss": 3.6077,
+      "step": 150700
+    },
+    {
+      "epoch": 0.013395922058060158,
+      "grad_norm": 3.1398537158966064,
+      "learning_rate": 3.181003933378136e-05,
+      "loss": 3.5958,
+      "step": 150800
+    },
+    {
+      "epoch": 0.013675003767603078,
+      "grad_norm": 3.1277642250061035,
+      "learning_rate": 3.178873345637023e-05,
+      "loss": 3.6304,
+      "step": 150900
+    },
+    {
+      "epoch": 0.013954085477146,
+      "grad_norm": 3.247443675994873,
+      "learning_rate": 3.176742225440994e-05,
+      "loss": 3.6196,
+      "step": 151000
+    },
+    {
+      "epoch": 0.013954085477146,
+      "eval_loss": 2.1872923374176025,
+      "eval_runtime": 52.3067,
+      "eval_samples_per_second": 194.889,
+      "eval_steps_per_second": 1.529,
+      "step": 151000
+    },
+    {
+      "epoch": 0.01423316718668892,
+      "grad_norm": 3.074709177017212,
+      "learning_rate": 3.17461057446153e-05,
+      "loss": 3.6314,
+      "step": 151100
+    },
+    {
+      "epoch": 0.014512248896231839,
+      "grad_norm": 3.163147211074829,
+      "learning_rate": 3.1724783943705304e-05,
+      "loss": 3.6013,
+      "step": 151200
+    },
+    {
+      "epoch": 0.014791330605774759,
+      "grad_norm": 3.062178373336792,
+      "learning_rate": 3.1703456868403126e-05,
+      "loss": 3.6219,
+      "step": 151300
+    },
+    {
+      "epoch": 0.015070412315317678,
+      "grad_norm": 2.8890295028686523,
+      "learning_rate": 3.168212453543601e-05,
+      "loss": 3.6319,
+      "step": 151400
+    },
+    {
+      "epoch": 0.015349494024860598,
+      "grad_norm": 2.8499069213867188,
+      "learning_rate": 3.166078696153539e-05,
+      "loss": 3.615,
+      "step": 151500
+    },
+    {
+      "epoch": 0.01562857573440352,
+      "grad_norm": 3.079871892929077,
+      "learning_rate": 3.163944416343677e-05,
+      "loss": 3.5953,
+      "step": 151600
+    },
+    {
+      "epoch": 0.015907657443946437,
+      "grad_norm": 3.028519868850708,
+      "learning_rate": 3.1618096157879776e-05,
+      "loss": 3.6217,
+      "step": 151700
+    },
+    {
+      "epoch": 0.01618673915348936,
+      "grad_norm": 3.1988399028778076,
+      "learning_rate": 3.159674296160809e-05,
+      "loss": 3.6,
+      "step": 151800
+    },
+    {
+      "epoch": 0.01646582086303228,
+      "grad_norm": 3.1502091884613037,
+      "learning_rate": 3.157538459136949e-05,
+      "loss": 3.6181,
+      "step": 151900
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "grad_norm": 3.1333131790161133,
+      "learning_rate": 3.1554021063915806e-05,
+      "loss": 3.6065,
+      "step": 152000
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "eval_loss": 2.1706299781799316,
+      "eval_runtime": 52.2255,
+      "eval_samples_per_second": 195.192,
+      "eval_steps_per_second": 1.532,
+      "step": 152000
+    },
+    {
+      "epoch": 0.01702398428211812,
+      "grad_norm": 3.1799027919769287,
+      "learning_rate": 3.153265239600291e-05,
+      "loss": 3.6177,
+      "step": 152100
+    },
+    {
+      "epoch": 0.017303065991661037,
+      "grad_norm": 3.09015154838562,
+      "learning_rate": 3.1511278604390694e-05,
+      "loss": 3.6111,
+      "step": 152200
+    },
+    {
+      "epoch": 0.01758214770120396,
+      "grad_norm": 3.2853939533233643,
+      "learning_rate": 3.1489899705843094e-05,
+      "loss": 3.6164,
+      "step": 152300
+    },
+    {
+      "epoch": 0.017861229410746877,
+      "grad_norm": 3.114593982696533,
+      "learning_rate": 3.146851571712804e-05,
+      "loss": 3.5874,
+      "step": 152400
+    },
+    {
+      "epoch": 0.018140311120289798,
+      "grad_norm": 3.1264195442199707,
+      "learning_rate": 3.1447126655017446e-05,
+      "loss": 3.6051,
+      "step": 152500
+    },
+    {
+      "epoch": 0.01841939282983272,
+      "grad_norm": 3.064248561859131,
+      "learning_rate": 3.142573253628721e-05,
+      "loss": 3.5926,
+      "step": 152600
+    },
+    {
+      "epoch": 0.018698474539375638,
+      "grad_norm": 3.199820041656494,
+      "learning_rate": 3.140433337771721e-05,
+      "loss": 3.6214,
+      "step": 152700
+    },
+    {
+      "epoch": 0.01897755624891856,
+      "grad_norm": 3.1903645992279053,
+      "learning_rate": 3.138292919609125e-05,
+      "loss": 3.602,
+      "step": 152800
+    },
+    {
+      "epoch": 0.019256637958461477,
+      "grad_norm": 3.2537660598754883,
+      "learning_rate": 3.13615200081971e-05,
+      "loss": 3.618,
+      "step": 152900
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "grad_norm": 3.051445722579956,
+      "learning_rate": 3.134010583082643e-05,
+      "loss": 3.5982,
+      "step": 153000
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "eval_loss": 2.1735916137695312,
+      "eval_runtime": 52.2439,
+      "eval_samples_per_second": 195.123,
+      "eval_steps_per_second": 1.531,
+      "step": 153000
+    },
+    {
+      "epoch": 0.01981480137754732,
+      "grad_norm": 2.984973192214966,
+      "learning_rate": 3.131868668077486e-05,
+      "loss": 3.5892,
+      "step": 153100
+    },
+    {
+      "epoch": 0.020093883087090238,
+      "grad_norm": 2.9883575439453125,
+      "learning_rate": 3.129726257484187e-05,
+      "loss": 3.6092,
+      "step": 153200
+    },
+    {
+      "epoch": 0.02037296479663316,
+      "grad_norm": 3.299248695373535,
+      "learning_rate": 3.127583352983086e-05,
+      "loss": 3.5973,
+      "step": 153300
+    },
+    {
+      "epoch": 0.020652046506176077,
+      "grad_norm": 3.1858959197998047,
+      "learning_rate": 3.125439956254907e-05,
+      "loss": 3.5986,
+      "step": 153400
+    },
+    {
+      "epoch": 0.020931128215719,
+      "grad_norm": 3.2093448638916016,
+      "learning_rate": 3.123296068980764e-05,
+      "loss": 3.5987,
+      "step": 153500
+    },
+    {
+      "epoch": 0.02121020992526192,
+      "grad_norm": 3.049703598022461,
+      "learning_rate": 3.1211516928421526e-05,
+      "loss": 3.5995,
+      "step": 153600
+    },
+    {
+      "epoch": 0.021489291634804838,
+      "grad_norm": 3.1410481929779053,
+      "learning_rate": 3.119006829520953e-05,
+      "loss": 3.586,
+      "step": 153700
+    },
+    {
+      "epoch": 0.02176837334434776,
+      "grad_norm": 2.9701108932495117,
+      "learning_rate": 3.1168614806994286e-05,
+      "loss": 3.5826,
+      "step": 153800
+    },
+    {
+      "epoch": 0.022047455053890677,
+      "grad_norm": 3.016268253326416,
+      "learning_rate": 3.114715648060221e-05,
+      "loss": 3.5746,
+      "step": 153900
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "grad_norm": 3.112840175628662,
+      "learning_rate": 3.1125693332863545e-05,
+      "loss": 3.5908,
+      "step": 154000
+    },
+    {
+      "epoch": 0.0223265367634336,
+      "eval_loss": 2.1794586181640625,
+      "eval_runtime": 52.4216,
+      "eval_samples_per_second": 194.462,
+      "eval_steps_per_second": 1.526,
+      "step": 154000
+    },
+    {
+      "epoch": 0.022605618472976517,
+      "grad_norm": 3.132110595703125,
+      "learning_rate": 3.110422538061228e-05,
+      "loss": 3.57,
+      "step": 154100
+    },
+    {
+      "epoch": 0.022884700182519438,
+      "grad_norm": 3.2359519004821777,
+      "learning_rate": 3.108275264068619e-05,
+      "loss": 3.6035,
+      "step": 154200
+    },
+    {
+      "epoch": 0.02316378189206236,
+      "grad_norm": 2.9039528369903564,
+      "learning_rate": 3.1061275129926816e-05,
+      "loss": 3.5772,
+      "step": 154300
+    },
+    {
+      "epoch": 0.023442863601605277,
+      "grad_norm": 3.10616397857666,
+      "learning_rate": 3.103979286517943e-05,
+      "loss": 3.58,
+      "step": 154400
+    },
+    {
+      "epoch": 0.0237219453111482,
+      "grad_norm": 3.2507059574127197,
+      "learning_rate": 3.101830586329302e-05,
+      "loss": 3.5788,
+      "step": 154500
+    },
+    {
+      "epoch": 0.024001027020691117,
+      "grad_norm": 3.145289421081543,
+      "learning_rate": 3.099681414112032e-05,
+      "loss": 3.5909,
+      "step": 154600
+    },
+    {
+      "epoch": 0.024280108730234038,
+      "grad_norm": 3.2056355476379395,
+      "learning_rate": 3.097531771551774e-05,
+      "loss": 3.5776,
+      "step": 154700
+    },
+    {
+      "epoch": 0.02455919043977696,
+      "grad_norm": 3.3703291416168213,
+      "learning_rate": 3.095381660334539e-05,
+      "loss": 3.5746,
+      "step": 154800
+    },
+    {
+      "epoch": 0.024838272149319877,
+      "grad_norm": 3.181138277053833,
+      "learning_rate": 3.0932310821467036e-05,
+      "loss": 3.5715,
+      "step": 154900
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "grad_norm": 3.000821590423584,
+      "learning_rate": 3.091080038675015e-05,
+      "loss": 3.5743,
+      "step": 155000
+    },
+    {
+      "epoch": 0.0251173538588628,
+      "eval_loss": 2.1706509590148926,
+      "eval_runtime": 52.3403,
+      "eval_samples_per_second": 194.764,
+      "eval_steps_per_second": 1.528,
+      "step": 155000
+    },
+    {
+      "epoch": 0.025396435568405717,
+      "grad_norm": 2.9740653038024902,
+      "learning_rate": 3.0889285316065806e-05,
+      "loss": 3.5711,
+      "step": 155100
+    },
+    {
+      "epoch": 0.025675517277948638,
+      "grad_norm": 3.1311419010162354,
+      "learning_rate": 3.0867765626288755e-05,
+      "loss": 3.5845,
+      "step": 155200
+    },
+    {
+      "epoch": 0.025954598987491556,
+      "grad_norm": 3.0719974040985107,
+      "learning_rate": 3.084624133429733e-05,
+      "loss": 3.5731,
+      "step": 155300
+    },
+    {
+      "epoch": 0.026233680697034478,
+      "grad_norm": 3.0461819171905518,
+      "learning_rate": 3.082471245697351e-05,
+      "loss": 3.5738,
+      "step": 155400
+    },
+    {
+      "epoch": 0.0265127624065774,
+      "grad_norm": 2.9734132289886475,
+      "learning_rate": 3.080317901120285e-05,
+      "loss": 3.5853,
+      "step": 155500
+    },
+    {
+      "epoch": 0.026791844116120317,
+      "grad_norm": 3.117506980895996,
+      "learning_rate": 3.078164101387449e-05,
+      "loss": 3.5847,
+      "step": 155600
+    },
+    {
+      "epoch": 0.02707092582566324,
+      "grad_norm": 3.141174554824829,
+      "learning_rate": 3.076009848188114e-05,
+      "loss": 3.5861,
+      "step": 155700
+    },
+    {
+      "epoch": 0.027350007535206156,
+      "grad_norm": 3.1444389820098877,
+      "learning_rate": 3.0738551432119086e-05,
+      "loss": 3.5716,
+      "step": 155800
+    },
+    {
+      "epoch": 0.027629089244749078,
+      "grad_norm": 3.2603206634521484,
+      "learning_rate": 3.0716999881488135e-05,
+      "loss": 3.5878,
+      "step": 155900
+    },
+    {
+      "epoch": 0.027908170954292,
+      "grad_norm": 3.119466781616211,
+      "learning_rate": 3.069544384689162e-05,
+      "loss": 3.5913,
+      "step": 156000
+    },
+    {
+      "epoch": 0.027908170954292,
+      "eval_loss": 2.1707427501678467,
+      "eval_runtime": 52.4036,
+      "eval_samples_per_second": 194.529,
+      "eval_steps_per_second": 1.527,
+      "step": 156000
+    },
+    {
+      "epoch": 0.028187252663834917,
+      "grad_norm": 3.3349738121032715,
+      "learning_rate": 3.06738833452364e-05,
+      "loss": 3.5642,
+      "step": 156100
+    },
+    {
+      "epoch": 0.02846633437337784,
+      "grad_norm": 3.151176691055298,
+      "learning_rate": 3.065231839343285e-05,
+      "loss": 3.5908,
+      "step": 156200
+    },
+    {
+      "epoch": 0.028745416082920756,
+      "grad_norm": 3.139475107192993,
+      "learning_rate": 3.0630749008394813e-05,
+      "loss": 3.5672,
+      "step": 156300
+    },
+    {
+      "epoch": 0.029024497792463678,
+      "grad_norm": 3.1396217346191406,
+      "learning_rate": 3.0609175207039636e-05,
+      "loss": 3.5787,
+      "step": 156400
+    },
+    {
+      "epoch": 0.0293035795020066,
+      "grad_norm": 2.9696104526519775,
+      "learning_rate": 3.05875970062881e-05,
+      "loss": 3.573,
+      "step": 156500
+    },
+    {
+      "epoch": 0.029582661211549517,
+      "grad_norm": 3.078080415725708,
+      "learning_rate": 3.056601442306445e-05,
+      "loss": 3.5583,
+      "step": 156600
+    },
+    {
+      "epoch": 0.02986174292109244,
+      "grad_norm": 3.1915009021759033,
+      "learning_rate": 3.054442747429638e-05,
+      "loss": 3.5809,
+      "step": 156700
+    },
+    {
+      "epoch": 0.030140824630635357,
+      "grad_norm": 3.194831132888794,
+      "learning_rate": 3.052283617691499e-05,
+      "loss": 3.5695,
+      "step": 156800
+    },
+    {
+      "epoch": 0.030419906340178278,
+      "grad_norm": 3.0851330757141113,
+      "learning_rate": 3.0501240547854793e-05,
+      "loss": 3.5686,
+      "step": 156900
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "grad_norm": 3.050123929977417,
+      "learning_rate": 3.047964060405371e-05,
+      "loss": 3.5643,
+      "step": 157000
+    },
+    {
+      "epoch": 0.030698988049721196,
+      "eval_loss": 2.165821075439453,
+      "eval_runtime": 52.6341,
+      "eval_samples_per_second": 193.677,
+      "eval_steps_per_second": 1.52,
+      "step": 157000
+    },
+    {
+      "epoch": 0.030978069759264117,
+      "grad_norm": 3.080662727355957,
+      "learning_rate": 3.0458036362453036e-05,
+      "loss": 3.5674,
+      "step": 157100
+    },
+    {
+      "epoch": 0.03125715146880704,
+      "grad_norm": 2.9070680141448975,
+      "learning_rate": 3.0436427839997444e-05,
+      "loss": 3.5709,
+      "step": 157200
+    },
+    {
+      "epoch": 0.03153623317834996,
+      "grad_norm": 3.212815046310425,
+      "learning_rate": 3.0414815053634966e-05,
+      "loss": 3.5596,
+      "step": 157300
+    },
+    {
+      "epoch": 0.031815314887892875,
+      "grad_norm": 2.9851808547973633,
+      "learning_rate": 3.039319802031696e-05,
+      "loss": 3.5877,
+      "step": 157400
+    },
+    {
+      "epoch": 0.0320943965974358,
+      "grad_norm": 3.1525375843048096,
+      "learning_rate": 3.037157675699814e-05,
+      "loss": 3.5742,
+      "step": 157500
+    },
+    {
+      "epoch": 0.03237347830697872,
+      "grad_norm": 3.253023386001587,
+      "learning_rate": 3.034995128063651e-05,
+      "loss": 3.5823,
+      "step": 157600
+    },
+    {
+      "epoch": 0.032652560016521635,
+      "grad_norm": 3.126237154006958,
+      "learning_rate": 3.0328321608193427e-05,
+      "loss": 3.5695,
+      "step": 157700
+    },
+    {
+      "epoch": 0.03293164172606456,
+      "grad_norm": 2.9912712574005127,
+      "learning_rate": 3.030668775663347e-05,
+      "loss": 3.5762,
+      "step": 157800
+    },
+    {
+      "epoch": 0.03321072343560748,
+      "grad_norm": 3.2612810134887695,
+      "learning_rate": 3.0285049742924564e-05,
+      "loss": 3.551,
+      "step": 157900
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "grad_norm": 3.226860761642456,
+      "learning_rate": 3.026340758403785e-05,
+      "loss": 3.5442,
+      "step": 158000
+    },
+    {
+      "epoch": 0.033489805145150396,
+      "eval_loss": 2.1677935123443604,
+      "eval_runtime": 52.5523,
+      "eval_samples_per_second": 193.978,
+      "eval_steps_per_second": 1.522,
+      "step": 158000
+    },
+    {
+      "epoch": 0.033768886854693314,
+      "grad_norm": 3.011373281478882,
+      "learning_rate": 3.024176129694774e-05,
+      "loss": 3.5603,
+      "step": 158100
+    },
+    {
+      "epoch": 0.03404796856423624,
+      "grad_norm": 3.067375898361206,
+      "learning_rate": 3.022011089863187e-05,
+      "loss": 3.5734,
+      "step": 158200
+    },
+    {
+      "epoch": 0.03432705027377916,
+      "grad_norm": 3.1003239154815674,
+      "learning_rate": 3.0198456406071134e-05,
+      "loss": 3.5688,
+      "step": 158300
+    },
+    {
+      "epoch": 0.034606131983322075,
+      "grad_norm": 2.9454071521759033,
+      "learning_rate": 3.017679783624959e-05,
+      "loss": 3.5617,
+      "step": 158400
+    },
+    {
+      "epoch": 0.034885213692865,
+      "grad_norm": 3.2112362384796143,
+      "learning_rate": 3.015513520615455e-05,
+      "loss": 3.5651,
+      "step": 158500
+    },
+    {
+      "epoch": 0.03516429540240792,
+      "grad_norm": 3.0805153846740723,
+      "learning_rate": 3.0133468532776454e-05,
+      "loss": 3.555,
+      "step": 158600
+    },
+    {
+      "epoch": 0.035443377111950836,
+      "grad_norm": 3.0264370441436768,
+      "learning_rate": 3.011179783310894e-05,
+      "loss": 3.5537,
+      "step": 158700
+    },
+    {
+      "epoch": 0.035722458821493754,
+      "grad_norm": 3.0615999698638916,
+      "learning_rate": 3.0090123124148807e-05,
+      "loss": 3.5466,
+      "step": 158800
+    },
+    {
+      "epoch": 0.03600154053103668,
+      "grad_norm": 3.1815524101257324,
+      "learning_rate": 3.0068444422896004e-05,
+      "loss": 3.5535,
+      "step": 158900
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "grad_norm": 3.0562305450439453,
+      "learning_rate": 3.004676174635358e-05,
+      "loss": 3.5663,
+      "step": 159000
+    },
+    {
+      "epoch": 0.036280622240579596,
+      "eval_loss": 2.173013925552368,
+      "eval_runtime": 52.4542,
+      "eval_samples_per_second": 194.341,
+      "eval_steps_per_second": 1.525,
+      "step": 159000
+    },
+    {
+      "epoch": 0.036559703950122514,
+      "grad_norm": 3.14689302444458,
+      "learning_rate": 3.002507511152774e-05,
+      "loss": 3.5568,
+      "step": 159100
+    },
+    {
+      "epoch": 0.03683878565966544,
+      "grad_norm": 3.087399959564209,
+      "learning_rate": 3.0003384535427765e-05,
+      "loss": 3.557,
+      "step": 159200
+    },
+    {
+      "epoch": 0.03711786736920836,
+      "grad_norm": 3.04844331741333,
+      "learning_rate": 2.9981690035066057e-05,
+      "loss": 3.5409,
+      "step": 159300
+    },
+    {
+      "epoch": 0.037396949078751275,
+      "grad_norm": 3.2028706073760986,
+      "learning_rate": 2.995999162745805e-05,
+      "loss": 3.5761,
+      "step": 159400
+    },
+    {
+      "epoch": 0.0376760307882942,
+      "grad_norm": 2.9123711585998535,
+      "learning_rate": 2.99382893296223e-05,
+      "loss": 3.5473,
+      "step": 159500
+    },
+    {
+      "epoch": 0.03795511249783712,
+      "grad_norm": 3.165459156036377,
+      "learning_rate": 2.9916583158580357e-05,
+      "loss": 3.5596,
+      "step": 159600
+    },
+    {
+      "epoch": 0.038234194207380036,
+      "grad_norm": 3.1565003395080566,
+      "learning_rate": 2.989487313135686e-05,
+      "loss": 3.5577,
+      "step": 159700
+    },
+    {
+      "epoch": 0.038513275916922954,
+      "grad_norm": 3.1155638694763184,
+      "learning_rate": 2.9873159264979433e-05,
+      "loss": 3.5572,
+      "step": 159800
+    },
+    {
+      "epoch": 0.03879235762646588,
+      "grad_norm": 3.1283843517303467,
+      "learning_rate": 2.9851441576478734e-05,
+      "loss": 3.5478,
+      "step": 159900
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "grad_norm": 3.2107434272766113,
+      "learning_rate": 2.9829720082888406e-05,
+      "loss": 3.5637,
+      "step": 160000
+    },
+    {
+      "epoch": 0.0390714393360088,
+      "eval_loss": 2.1652143001556396,
+      "eval_runtime": 52.479,
+      "eval_samples_per_second": 194.249,
+      "eval_steps_per_second": 1.524,
+      "step": 160000
+    },
+    {
+      "epoch": 0.00027908170954291995,
+      "grad_norm": 1.561998724937439,
+      "learning_rate": 2.9807994801245094e-05,
+      "loss": 1.7734,
+      "step": 160100
+    },
+    {
+      "epoch": 0.0005581634190858399,
+      "grad_norm": 1.5832147598266602,
+      "learning_rate": 2.9786265748588383e-05,
+      "loss": 1.7793,
+      "step": 160200
+    },
+    {
+      "epoch": 0.0008372451286287599,
+      "grad_norm": 1.5801053047180176,
+      "learning_rate": 2.9764532941960848e-05,
+      "loss": 1.7738,
+      "step": 160300
+    },
+    {
+      "epoch": 0.0011163268381716798,
+      "grad_norm": 1.5710026025772095,
+      "learning_rate": 2.9742796398407996e-05,
+      "loss": 1.7729,
+      "step": 160400
+    },
+    {
+      "epoch": 0.0013954085477146,
+      "grad_norm": 1.5338846445083618,
+      "learning_rate": 2.9721056134978263e-05,
+      "loss": 1.7725,
+      "step": 160500
+    },
+    {
+      "epoch": 0.0016744902572575198,
+      "grad_norm": 1.5545586347579956,
+      "learning_rate": 2.9699312168722998e-05,
+      "loss": 1.7748,
+      "step": 160600
+    },
+    {
+      "epoch": 0.00195357196680044,
+      "grad_norm": 1.5022891759872437,
+      "learning_rate": 2.967756451669646e-05,
+      "loss": 1.7757,
+      "step": 160700
+    },
+    {
+      "epoch": 0.0022326536763433596,
+      "grad_norm": 1.5773268938064575,
+      "learning_rate": 2.9655813195955808e-05,
+      "loss": 1.7746,
+      "step": 160800
+    },
+    {
+      "epoch": 0.0025117353858862797,
+      "grad_norm": 1.5689061880111694,
+      "learning_rate": 2.9634058223561058e-05,
+      "loss": 1.7767,
+      "step": 160900
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "grad_norm": 1.523888349533081,
+      "learning_rate": 2.9612299616575108e-05,
+      "loss": 1.7725,
+      "step": 161000
+    },
+    {
+      "epoch": 0.0027908170954292,
+      "eval_loss": 2.154878616333008,
+      "eval_runtime": 52.3123,
+      "eval_samples_per_second": 194.868,
+      "eval_steps_per_second": 1.529,
+      "step": 161000
+    },
+    {
+      "epoch": 0.00306989880497212,
+      "grad_norm": 1.5541263818740845,
+      "learning_rate": 2.9590537392063693e-05,
+      "loss": 1.775,
+      "step": 161100
+    },
+    {
+      "epoch": 0.0033489805145150396,
+      "grad_norm": 1.542082667350769,
+      "learning_rate": 2.9568771567095403e-05,
+      "loss": 1.775,
+      "step": 161200
+    },
+    {
+      "epoch": 0.0036280622240579597,
+      "grad_norm": 1.6016403436660767,
+      "learning_rate": 2.9547002158741637e-05,
+      "loss": 1.7809,
+      "step": 161300
+    },
+    {
+      "epoch": 0.00390714393360088,
+      "grad_norm": 1.604466438293457,
+      "learning_rate": 2.952522918407661e-05,
+      "loss": 1.7691,
+      "step": 161400
+    },
+    {
+      "epoch": 0.0041862256431437995,
+      "grad_norm": 1.507333755493164,
+      "learning_rate": 2.950345266017732e-05,
+      "loss": 1.7706,
+      "step": 161500
+    },
+    {
+      "epoch": 0.004465307352686719,
+      "grad_norm": 1.6002788543701172,
+      "learning_rate": 2.948167260412358e-05,
+      "loss": 1.7947,
+      "step": 161600
+    },
+    {
+      "epoch": 0.00474438906222964,
+      "grad_norm": 1.5560261011123657,
+      "learning_rate": 2.9459889032997933e-05,
+      "loss": 1.7981,
+      "step": 161700
+    },
+    {
+      "epoch": 0.005023470771772559,
+      "grad_norm": 1.665317177772522,
+      "learning_rate": 2.9438101963885728e-05,
+      "loss": 1.7923,
+      "step": 161800
+    },
+    {
+      "epoch": 0.00530255248131548,
+      "grad_norm": 1.5762344598770142,
+      "learning_rate": 2.9416311413875008e-05,
+      "loss": 1.7968,
+      "step": 161900
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "grad_norm": 1.553109884262085,
+      "learning_rate": 2.9394517400056583e-05,
+      "loss": 1.7948,
+      "step": 162000
+    },
+    {
+      "epoch": 0.0055816341908584,
+      "eval_loss": 2.1695141792297363,
+      "eval_runtime": 51.7758,
+      "eval_samples_per_second": 196.887,
+      "eval_steps_per_second": 1.545,
+      "step": 162000
+    },
+    {
+      "epoch": 0.005860715900401319,
+      "grad_norm": 1.566721796989441,
+      "learning_rate": 2.937271993952395e-05,
+      "loss": 1.7959,
+      "step": 162100
+    },
+    {
+      "epoch": 0.00613979760994424,
+      "grad_norm": 1.5952585935592651,
+      "learning_rate": 2.9350919049373343e-05,
+      "loss": 1.7892,
+      "step": 162200
+    },
+    {
+      "epoch": 0.0064188793194871595,
+      "grad_norm": 1.5383343696594238,
+      "learning_rate": 2.932911474670365e-05,
+      "loss": 1.7918,
+      "step": 162300
+    },
+    {
+      "epoch": 0.006697961029030079,
+      "grad_norm": 1.5342581272125244,
+      "learning_rate": 2.9307307048616468e-05,
+      "loss": 1.7815,
+      "step": 162400
+    },
+    {
+      "epoch": 0.006977042738573,
+      "grad_norm": 1.533451795578003,
+      "learning_rate": 2.9285495972216027e-05,
+      "loss": 1.7834,
+      "step": 162500
+    },
+    {
+      "epoch": 0.0072561244481159195,
+      "grad_norm": 1.5433770418167114,
+      "learning_rate": 2.9263681534609233e-05,
+      "loss": 1.7886,
+      "step": 162600
+    },
+    {
+      "epoch": 0.007535206157658839,
+      "grad_norm": 1.5695182085037231,
+      "learning_rate": 2.924186375290562e-05,
+      "loss": 1.7934,
+      "step": 162700
+    },
+    {
+      "epoch": 0.00781428786720176,
+      "grad_norm": 1.5996042490005493,
+      "learning_rate": 2.922004264421733e-05,
+      "loss": 1.7896,
+      "step": 162800
+    },
+    {
+      "epoch": 0.00809336957674468,
+      "grad_norm": 1.4973070621490479,
+      "learning_rate": 2.919821822565913e-05,
+      "loss": 1.7862,
+      "step": 162900
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "grad_norm": 1.6223851442337036,
+      "learning_rate": 2.9176390514348384e-05,
+      "loss": 1.7797,
+      "step": 163000
+    },
+    {
+      "epoch": 0.008372451286287599,
+      "eval_loss": 2.1567208766937256,
+      "eval_runtime": 51.8158,
+      "eval_samples_per_second": 196.735,
+      "eval_steps_per_second": 1.544,
+      "step": 163000
+    },
+    {
+      "epoch": 0.008651532995830519,
+      "grad_norm": 1.5375715494155884,
+      "learning_rate": 2.915455952740503e-05,
+      "loss": 1.7847,
+      "step": 163100
+    },
+    {
+      "epoch": 0.008930614705373438,
+      "grad_norm": 1.4768236875534058,
+      "learning_rate": 2.9132725281951584e-05,
+      "loss": 1.7804,
+      "step": 163200
+    },
+    {
+      "epoch": 0.00920969641491636,
+      "grad_norm": 1.583068609237671,
+      "learning_rate": 2.9110887795113108e-05,
+      "loss": 1.785,
+      "step": 163300
+    },
+    {
+      "epoch": 0.00948877812445928,
+      "grad_norm": 1.5692275762557983,
+      "learning_rate": 2.9089047084017206e-05,
+      "loss": 1.7824,
+      "step": 163400
+    },
+    {
+      "epoch": 0.0097678598340022,
+      "grad_norm": 1.5128995180130005,
+      "learning_rate": 2.9067203165794028e-05,
+      "loss": 1.7888,
+      "step": 163500
+    },
+    {
+      "epoch": 0.010046941543545119,
+      "grad_norm": 1.5675851106643677,
+      "learning_rate": 2.904535605757622e-05,
+      "loss": 1.7761,
+      "step": 163600
+    },
+    {
+      "epoch": 0.010326023253088039,
+      "grad_norm": 1.5943450927734375,
+      "learning_rate": 2.902350577649894e-05,
+      "loss": 1.7837,
+      "step": 163700
+    },
+    {
+      "epoch": 0.01060510496263096,
+      "grad_norm": 1.4797823429107666,
+      "learning_rate": 2.9001652339699818e-05,
+      "loss": 1.7785,
+      "step": 163800
+    },
+    {
+      "epoch": 0.01088418667217388,
+      "grad_norm": 1.5837212800979614,
+      "learning_rate": 2.8979795764319007e-05,
+      "loss": 1.7769,
+      "step": 163900
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "grad_norm": 1.5809427499771118,
+      "learning_rate": 2.8957936067499054e-05,
+      "loss": 1.7876,
+      "step": 164000
+    },
+    {
+      "epoch": 0.0111632683817168,
+      "eval_loss": 2.157485246658325,
+      "eval_runtime": 51.8186,
+      "eval_samples_per_second": 196.725,
+      "eval_steps_per_second": 1.544,
+      "step": 164000
+    },
+    {
+      "epoch": 0.011442350091259719,
+      "grad_norm": 1.4939141273498535,
+      "learning_rate": 2.8936073266385e-05,
+      "loss": 1.7851,
+      "step": 164100
+    },
+    {
+      "epoch": 0.011721431800802639,
+      "grad_norm": 1.6713926792144775,
+      "learning_rate": 2.8914207378124304e-05,
+      "loss": 1.7852,
+      "step": 164200
+    },
+    {
+      "epoch": 0.012000513510345558,
+      "grad_norm": 1.526655673980713,
+      "learning_rate": 2.889233841986686e-05,
+      "loss": 1.7744,
+      "step": 164300
+    },
+    {
+      "epoch": 0.01227959521988848,
+      "grad_norm": 1.674926996231079,
+      "learning_rate": 2.8870466408764952e-05,
+      "loss": 1.7761,
+      "step": 164400
+    },
+    {
+      "epoch": 0.0125586769294314,
+      "grad_norm": 1.5404378175735474,
+      "learning_rate": 2.8848591361973278e-05,
+      "loss": 1.7889,
+      "step": 164500
+    },
+    {
+      "epoch": 0.012837758638974319,
+      "grad_norm": 1.5554423332214355,
+      "learning_rate": 2.88267132966489e-05,
+      "loss": 1.7876,
+      "step": 164600
+    },
+    {
+      "epoch": 0.013116840348517239,
+      "grad_norm": 1.595197319984436,
+      "learning_rate": 2.880483222995125e-05,
+      "loss": 1.7806,
+      "step": 164700
+    },
+    {
+      "epoch": 0.013395922058060158,
+      "grad_norm": 1.611559510231018,
+      "learning_rate": 2.8782948179042114e-05,
+      "loss": 1.7856,
+      "step": 164800
+    },
+    {
+      "epoch": 0.013675003767603078,
+      "grad_norm": 1.622501015663147,
+      "learning_rate": 2.876106116108564e-05,
+      "loss": 1.7838,
+      "step": 164900
+    },
+    {
+      "epoch": 0.013954085477146,
+      "grad_norm": 1.5229750871658325,
+      "learning_rate": 2.873917119324826e-05,
+      "loss": 1.7851,
+      "step": 165000
+    },
+    {
+      "epoch": 0.013954085477146,
+      "eval_loss": 2.1587064266204834,
+      "eval_runtime": 51.9184,
+      "eval_samples_per_second": 196.347,
+      "eval_steps_per_second": 1.541,
+      "step": 165000
+    },
+    {
+      "epoch": 0.01423316718668892,
+      "grad_norm": 1.5421415567398071,
+      "learning_rate": 2.8717278292698767e-05,
+      "loss": 1.7853,
+      "step": 165100
+    },
+    {
+      "epoch": 0.014512248896231839,
+      "grad_norm": 1.598402976989746,
+      "learning_rate": 2.8695382476608228e-05,
+      "loss": 1.7886,
+      "step": 165200
+    },
+    {
+      "epoch": 0.014791330605774759,
+      "grad_norm": 1.5251154899597168,
+      "learning_rate": 2.867348376215e-05,
+      "loss": 1.7885,
+      "step": 165300
+    },
+    {
+      "epoch": 0.015070412315317678,
+      "grad_norm": 1.554371953010559,
+      "learning_rate": 2.86515821664997e-05,
+      "loss": 1.7831,
+      "step": 165400
+    },
+    {
+      "epoch": 0.015349494024860598,
+      "grad_norm": 1.606136679649353,
+      "learning_rate": 2.8629677706835234e-05,
+      "loss": 1.7672,
+      "step": 165500
+    },
+    {
+      "epoch": 0.01562857573440352,
+      "grad_norm": 1.5520561933517456,
+      "learning_rate": 2.8607770400336738e-05,
+      "loss": 1.7775,
+      "step": 165600
+    },
+    {
+      "epoch": 0.015907657443946437,
+      "grad_norm": 1.5017564296722412,
+      "learning_rate": 2.8585860264186582e-05,
+      "loss": 1.7837,
+      "step": 165700
+    },
+    {
+      "epoch": 0.01618673915348936,
+      "grad_norm": 1.5462771654129028,
+      "learning_rate": 2.8563947315569346e-05,
+      "loss": 1.7757,
+      "step": 165800
+    },
+    {
+      "epoch": 0.01646582086303228,
+      "grad_norm": 1.519423484802246,
+      "learning_rate": 2.8542031571671833e-05,
+      "loss": 1.7737,
+      "step": 165900
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "grad_norm": 1.552426815032959,
+      "learning_rate": 2.852011304968304e-05,
+      "loss": 1.7845,
+      "step": 166000
+    },
+    {
+      "epoch": 0.016744902572575198,
+      "eval_loss": 2.1654627323150635,
+      "eval_runtime": 51.7515,
+      "eval_samples_per_second": 196.98,
+      "eval_steps_per_second": 1.546,
+      "step": 166000
+    },
+    {
+      "epoch": 0.01702398428211812,
+      "grad_norm": 1.6090401411056519,
+      "learning_rate": 2.849819176679412e-05,
+      "loss": 1.7792,
+      "step": 166100
+    },
+    {
+      "epoch": 0.017303065991661037,
+      "grad_norm": 1.4991530179977417,
+      "learning_rate": 2.8476267740198403e-05,
+      "loss": 1.7757,
+      "step": 166200
+    },
+    {
+      "epoch": 0.01758214770120396,
+      "grad_norm": 1.545792579650879,
+      "learning_rate": 2.8454340987091382e-05,
+      "loss": 1.7782,
+      "step": 166300
+    },
+    {
+      "epoch": 0.017861229410746877,
+      "grad_norm": 1.5758668184280396,
+      "learning_rate": 2.8432411524670675e-05,
+      "loss": 1.7627,
+      "step": 166400
+    },
+    {
+      "epoch": 0.018140311120289798,
+      "grad_norm": 1.5638821125030518,
+      "learning_rate": 2.8410479370136035e-05,
+      "loss": 1.7816,
+      "step": 166500
+    },
+    {
+      "epoch": 0.01841939282983272,
+      "grad_norm": 1.6477131843566895,
+      "learning_rate": 2.8388544540689314e-05,
+      "loss": 1.7814,
+      "step": 166600
+    },
+    {
+      "epoch": 0.018698474539375638,
+      "grad_norm": 1.5519869327545166,
+      "learning_rate": 2.836660705353447e-05,
+      "loss": 1.7747,
+      "step": 166700
+    },
+    {
+      "epoch": 0.01897755624891856,
+      "grad_norm": 1.5598399639129639,
+      "learning_rate": 2.8344666925877556e-05,
+      "loss": 1.7778,
+      "step": 166800
+    },
+    {
+      "epoch": 0.019256637958461477,
+      "grad_norm": 1.5361994504928589,
+      "learning_rate": 2.8322724174926664e-05,
+      "loss": 1.7796,
+      "step": 166900
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "grad_norm": 1.5680959224700928,
+      "learning_rate": 2.8300778817891976e-05,
+      "loss": 1.7742,
+      "step": 167000
+    },
+    {
+      "epoch": 0.0195357196680044,
+      "eval_loss": 2.1607890129089355,
+      "eval_runtime": 52.1406,
+      "eval_samples_per_second": 195.51,
+      "eval_steps_per_second": 1.534,
+      "step": 167000
     }
   ],
   "logging_steps": 100,
       "attributes": {}
     }
   },
+  "total_flos": 1.4574492479127552e+19,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:66d39cf86390d3a1f1bf05e9571d4d0939bf6f5fc60ae060f7397e9b450ea61c
-size 5713

 version https://git-lfs.github.com/spec/v1
+oid sha256:9318402efc23f8b2e09dec877ba7b88863d76a00aceeef7c22f944e9f6a43e28
+size 5777