Model save

Browse files

Files changed (6) hide show

README.md +57 -0
all_results.json +9 -0
generation_config.json +9 -0
runs/Dec26_12-08-20_mia1-gpu-110/events.out.tfevents.1735214952.mia1-gpu-110.3773944.0 +2 -2
train_results.json +9 -0
trainer_state.json +1592 -0

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+base_model: meta-llama/Llama-3.1-8B
+library_name: transformers
+model_name: llama-3_1-8b-overfit-ua
+tags:
+- generated_from_trainer
+- trl
+- sft
+license: llama3.1
+---
+# Model Card for llama-3_1-8b-overfit-ua
+This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="antonpolishko/llama-3_1-8b-overfit-ua", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/polyagent/huggingface/runs/e2est3jv)
+This model was trained with supervised fine-tuning (SFT).
+### Framework versions
+- TRL: 0.12.1
+- Transformers: 4.46.3
+- PyTorch: 2.6.0.dev20241113+rocm6.2
+- Datasets: 3.1.0
+- Tokenizers: 0.20.3
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 6.427401199279931e+18,
+    "train_loss": 1.711617823146263,
+    "train_runtime": 5145.0339,
+    "train_samples": 95663,
+    "train_samples_per_second": 13.545,
+    "train_steps_per_second": 0.212
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": 128001,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.46.3"
+}

runs/Dec26_12-08-20_mia1-gpu-110/events.out.tfevents.1735214952.mia1-gpu-110.3773944.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a9d2c1f73e98bf878dd4f8cd6855cb864dea5e95da7596a422dad90da6a2554
-size 52652

 version https://git-lfs.github.com/spec/v1
+oid sha256:f07e9c40b0852d6075483db246b6a6194366aa39fd072c30691d0ce0de6cdd45
+size 53006

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 3.0,
+    "total_flos": 6.427401199279931e+18,
+    "train_loss": 1.711617823146263,
+    "train_runtime": 5145.0339,
+    "train_samples": 95663,
+    "train_samples_per_second": 13.545,
+    "train_steps_per_second": 0.212
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1592 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 300,
+  "global_step": 1089,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0027548209366391185,
+      "grad_norm": 10.8125,
+      "learning_rate": 1e-06,
+      "loss": 1.9168,
+      "step": 1
+    },
+    {
+      "epoch": 0.013774104683195593,
+      "grad_norm": 8.4375,
+      "learning_rate": 1.999962411893365e-06,
+      "loss": 1.9099,
+      "step": 5
+    },
+    {
+      "epoch": 0.027548209366391185,
+      "grad_norm": 4.90625,
+      "learning_rate": 1.9997327170302815e-06,
+      "loss": 1.8629,
+      "step": 10
+    },
+    {
+      "epoch": 0.04132231404958678,
+      "grad_norm": 3.671875,
+      "learning_rate": 1.99929425749243e-06,
+      "loss": 1.8376,
+      "step": 15
+    },
+    {
+      "epoch": 0.05509641873278237,
+      "grad_norm": 2.828125,
+      "learning_rate": 1.998647124839145e-06,
+      "loss": 1.8137,
+      "step": 20
+    },
+    {
+      "epoch": 0.06887052341597796,
+      "grad_norm": 2.640625,
+      "learning_rate": 1.997791454204984e-06,
+      "loss": 1.8055,
+      "step": 25
+    },
+    {
+      "epoch": 0.08264462809917356,
+      "grad_norm": 2.234375,
+      "learning_rate": 1.9967274242715065e-06,
+      "loss": 1.7913,
+      "step": 30
+    },
+    {
+      "epoch": 0.09641873278236915,
+      "grad_norm": 2.21875,
+      "learning_rate": 1.995455257229964e-06,
+      "loss": 1.7897,
+      "step": 35
+    },
+    {
+      "epoch": 0.11019283746556474,
+      "grad_norm": 1.8828125,
+      "learning_rate": 1.9939752187349e-06,
+      "loss": 1.7752,
+      "step": 40
+    },
+    {
+      "epoch": 0.12396694214876033,
+      "grad_norm": 1.8671875,
+      "learning_rate": 1.9922876178486764e-06,
+      "loss": 1.7802,
+      "step": 45
+    },
+    {
+      "epoch": 0.13774104683195593,
+      "grad_norm": 1.65625,
+      "learning_rate": 1.9903928069769356e-06,
+      "loss": 1.7686,
+      "step": 50
+    },
+    {
+      "epoch": 0.15151515151515152,
+      "grad_norm": 1.765625,
+      "learning_rate": 1.9882911817950105e-06,
+      "loss": 1.7702,
+      "step": 55
+    },
+    {
+      "epoch": 0.1652892561983471,
+      "grad_norm": 1.7265625,
+      "learning_rate": 1.985983181165299e-06,
+      "loss": 1.7618,
+      "step": 60
+    },
+    {
+      "epoch": 0.1790633608815427,
+      "grad_norm": 1.609375,
+      "learning_rate": 1.98346928704562e-06,
+      "loss": 1.7627,
+      "step": 65
+    },
+    {
+      "epoch": 0.1928374655647383,
+      "grad_norm": 1.5625,
+      "learning_rate": 1.9807500243885744e-06,
+      "loss": 1.7665,
+      "step": 70
+    },
+    {
+      "epoch": 0.2066115702479339,
+      "grad_norm": 1.515625,
+      "learning_rate": 1.9778259610319187e-06,
+      "loss": 1.755,
+      "step": 75
+    },
+    {
+      "epoch": 0.22038567493112948,
+      "grad_norm": 1.5234375,
+      "learning_rate": 1.9746977075799933e-06,
+      "loss": 1.7574,
+      "step": 80
+    },
+    {
+      "epoch": 0.23415977961432508,
+      "grad_norm": 1.4609375,
+      "learning_rate": 1.9713659172762126e-06,
+      "loss": 1.7529,
+      "step": 85
+    },
+    {
+      "epoch": 0.24793388429752067,
+      "grad_norm": 1.453125,
+      "learning_rate": 1.9678312858666578e-06,
+      "loss": 1.7417,
+      "step": 90
+    },
+    {
+      "epoch": 0.26170798898071623,
+      "grad_norm": 1.421875,
+      "learning_rate": 1.964094551454788e-06,
+      "loss": 1.7509,
+      "step": 95
+    },
+    {
+      "epoch": 0.27548209366391185,
+      "grad_norm": 1.5,
+      "learning_rate": 1.960156494347309e-06,
+      "loss": 1.7486,
+      "step": 100
+    },
+    {
+      "epoch": 0.2892561983471074,
+      "grad_norm": 1.3984375,
+      "learning_rate": 1.9560179368912327e-06,
+      "loss": 1.7531,
+      "step": 105
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 1.40625,
+      "learning_rate": 1.95167974330215e-06,
+      "loss": 1.7435,
+      "step": 110
+    },
+    {
+      "epoch": 0.3168044077134986,
+      "grad_norm": 1.4140625,
+      "learning_rate": 1.9471428194837667e-06,
+      "loss": 1.7403,
+      "step": 115
+    },
+    {
+      "epoch": 0.3305785123966942,
+      "grad_norm": 1.3984375,
+      "learning_rate": 1.9424081128387337e-06,
+      "loss": 1.7435,
+      "step": 120
+    },
+    {
+      "epoch": 0.3443526170798898,
+      "grad_norm": 1.40625,
+      "learning_rate": 1.9374766120708077e-06,
+      "loss": 1.75,
+      "step": 125
+    },
+    {
+      "epoch": 0.3581267217630854,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.932349346978389e-06,
+      "loss": 1.7469,
+      "step": 130
+    },
+    {
+      "epoch": 0.371900826446281,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.927027388239482e-06,
+      "loss": 1.734,
+      "step": 135
+    },
+    {
+      "epoch": 0.3856749311294766,
+      "grad_norm": 1.40625,
+      "learning_rate": 1.921511847188112e-06,
+      "loss": 1.7385,
+      "step": 140
+    },
+    {
+      "epoch": 0.39944903581267216,
+      "grad_norm": 1.390625,
+      "learning_rate": 1.915803875582259e-06,
+      "loss": 1.736,
+      "step": 145
+    },
+    {
+      "epoch": 0.4132231404958678,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.9099046653633437e-06,
+      "loss": 1.7336,
+      "step": 150
+    },
+    {
+      "epoch": 0.42699724517906334,
+      "grad_norm": 1.3515625,
+      "learning_rate": 1.9038154484073284e-06,
+      "loss": 1.7324,
+      "step": 155
+    },
+    {
+      "epoch": 0.44077134986225897,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.8975374962674753e-06,
+      "loss": 1.7406,
+      "step": 160
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.8910721199088195e-06,
+      "loss": 1.7461,
+      "step": 165
+    },
+    {
+      "epoch": 0.46831955922865015,
+      "grad_norm": 1.3515625,
+      "learning_rate": 1.8844206694344138e-06,
+      "loss": 1.7296,
+      "step": 170
+    },
+    {
+      "epoch": 0.4820936639118457,
+      "grad_norm": 1.390625,
+      "learning_rate": 1.877584533803398e-06,
+      "loss": 1.7286,
+      "step": 175
+    },
+    {
+      "epoch": 0.49586776859504134,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.8705651405409566e-06,
+      "loss": 1.7225,
+      "step": 180
+    },
+    {
+      "epoch": 0.509641873278237,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.8633639554402234e-06,
+      "loss": 1.7366,
+      "step": 185
+    },
+    {
+      "epoch": 0.5234159779614325,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.8559824822561913e-06,
+      "loss": 1.7353,
+      "step": 190
+    },
+    {
+      "epoch": 0.5371900826446281,
+      "grad_norm": 1.3515625,
+      "learning_rate": 1.8484222623917e-06,
+      "loss": 1.7223,
+      "step": 195
+    },
+    {
+      "epoch": 0.5509641873278237,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.8406848745755578e-06,
+      "loss": 1.7256,
+      "step": 200
+    },
+    {
+      "epoch": 0.5647382920110193,
+      "grad_norm": 1.3671875,
+      "learning_rate": 1.832771934532872e-06,
+      "loss": 1.7288,
+      "step": 205
+    },
+    {
+      "epoch": 0.5785123966942148,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.8246850946476505e-06,
+      "loss": 1.7247,
+      "step": 210
+    },
+    {
+      "epoch": 0.5922865013774105,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.8164260436177524e-06,
+      "loss": 1.7221,
+      "step": 215
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 1.3515625,
+      "learning_rate": 1.8079965061022518e-06,
+      "loss": 1.7224,
+      "step": 220
+    },
+    {
+      "epoch": 0.6198347107438017,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.7993982423612941e-06,
+      "loss": 1.7206,
+      "step": 225
+    },
+    {
+      "epoch": 0.6336088154269972,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.7906330478885174e-06,
+      "loss": 1.7252,
+      "step": 230
+    },
+    {
+      "epoch": 0.6473829201101928,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.7817027530361174e-06,
+      "loss": 1.725,
+      "step": 235
+    },
+    {
+      "epoch": 0.6611570247933884,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.7726092226326315e-06,
+      "loss": 1.7189,
+      "step": 240
+    },
+    {
+      "epoch": 0.6749311294765841,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.7633543555935245e-06,
+      "loss": 1.719,
+      "step": 245
+    },
+    {
+      "epoch": 0.6887052341597796,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.7539400845246564e-06,
+      "loss": 1.7121,
+      "step": 250
+    },
+    {
+      "epoch": 0.7024793388429752,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.744368375318715e-06,
+      "loss": 1.7198,
+      "step": 255
+    },
+    {
+      "epoch": 0.7162534435261708,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.7346412267446958e-06,
+      "loss": 1.7149,
+      "step": 260
+    },
+    {
+      "epoch": 0.7300275482093664,
+      "grad_norm": 1.3515625,
+      "learning_rate": 1.724760670030521e-06,
+      "loss": 1.713,
+      "step": 265
+    },
+    {
+      "epoch": 0.743801652892562,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.7147287684388738e-06,
+      "loss": 1.7192,
+      "step": 270
+    },
+    {
+      "epoch": 0.7575757575757576,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.7045476168363498e-06,
+      "loss": 1.721,
+      "step": 275
+    },
+    {
+      "epoch": 0.7713498622589532,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.6942193412560043e-06,
+      "loss": 1.7102,
+      "step": 280
+    },
+    {
+      "epoch": 0.7851239669421488,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.6837460984533934e-06,
+      "loss": 1.7218,
+      "step": 285
+    },
+    {
+      "epoch": 0.7988980716253443,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.6731300754562008e-06,
+      "loss": 1.7107,
+      "step": 290
+    },
+    {
+      "epoch": 0.8126721763085399,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.6623734891075385e-06,
+      "loss": 1.7138,
+      "step": 295
+    },
+    {
+      "epoch": 0.8264462809917356,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.6514785856030272e-06,
+      "loss": 1.7112,
+      "step": 300
+    },
+    {
+      "epoch": 0.8264462809917356,
+      "eval_loss": 1.7193405628204346,
+      "eval_runtime": 8.403,
+      "eval_samples_per_second": 83.779,
+      "eval_steps_per_second": 2.618,
+      "step": 300
+    },
+    {
+      "epoch": 0.8402203856749312,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.640447640021744e-06,
+      "loss": 1.7211,
+      "step": 305
+    },
+    {
+      "epoch": 0.8539944903581267,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.6292829558511376e-06,
+      "loss": 1.7142,
+      "step": 310
+    },
+    {
+      "epoch": 0.8677685950413223,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.6179868645060162e-06,
+      "loss": 1.7028,
+      "step": 315
+    },
+    {
+      "epoch": 0.8815426997245179,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.6065617248416967e-06,
+      "loss": 1.7091,
+      "step": 320
+    },
+    {
+      "epoch": 0.8953168044077136,
+      "grad_norm": 1.3125,
+      "learning_rate": 1.59500992266143e-06,
+      "loss": 1.7143,
+      "step": 325
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 1.3515625,
+      "learning_rate": 1.5833338702181959e-06,
+      "loss": 1.7105,
+      "step": 330
+    },
+    {
+      "epoch": 0.9228650137741047,
+      "grad_norm": 1.40625,
+      "learning_rate": 1.5715360057109744e-06,
+      "loss": 1.7146,
+      "step": 335
+    },
+    {
+      "epoch": 0.9366391184573003,
+      "grad_norm": 1.4140625,
+      "learning_rate": 1.5596187927755993e-06,
+      "loss": 1.7145,
+      "step": 340
+    },
+    {
+      "epoch": 0.9504132231404959,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.5475847199703033e-06,
+      "loss": 1.7099,
+      "step": 345
+    },
+    {
+      "epoch": 0.9641873278236914,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.535436300256053e-06,
+      "loss": 1.7143,
+      "step": 350
+    },
+    {
+      "epoch": 0.977961432506887,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.523176070471793e-06,
+      "loss": 1.7131,
+      "step": 355
+    },
+    {
+      "epoch": 0.9917355371900827,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.5108065908047014e-06,
+      "loss": 1.7248,
+      "step": 360
+    },
+    {
+      "epoch": 1.0055096418732783,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.4983304442555698e-06,
+      "loss": 1.7079,
+      "step": 365
+    },
+    {
+      "epoch": 1.019283746556474,
+      "grad_norm": 1.3671875,
+      "learning_rate": 1.4857502360994204e-06,
+      "loss": 1.712,
+      "step": 370
+    },
+    {
+      "epoch": 1.0330578512396693,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.4730685933414714e-06,
+      "loss": 1.703,
+      "step": 375
+    },
+    {
+      "epoch": 1.046831955922865,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.4602881641685643e-06,
+      "loss": 1.6944,
+      "step": 380
+    },
+    {
+      "epoch": 1.0606060606060606,
+      "grad_norm": 1.375,
+      "learning_rate": 1.4474116173961668e-06,
+      "loss": 1.7032,
+      "step": 385
+    },
+    {
+      "epoch": 1.0743801652892562,
+      "grad_norm": 1.375,
+      "learning_rate": 1.4344416419110728e-06,
+      "loss": 1.7122,
+      "step": 390
+    },
+    {
+      "epoch": 1.0881542699724518,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.4213809461099033e-06,
+      "loss": 1.703,
+      "step": 395
+    },
+    {
+      "epoch": 1.1019283746556474,
+      "grad_norm": 1.3515625,
+      "learning_rate": 1.4082322573335422e-06,
+      "loss": 1.703,
+      "step": 400
+    },
+    {
+      "epoch": 1.115702479338843,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.394998321297608e-06,
+      "loss": 1.7024,
+      "step": 405
+    },
+    {
+      "epoch": 1.1294765840220387,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.3816819015190943e-06,
+      "loss": 1.7058,
+      "step": 410
+    },
+    {
+      "epoch": 1.1432506887052343,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.3682857787392905e-06,
+      "loss": 1.6987,
+      "step": 415
+    },
+    {
+      "epoch": 1.1570247933884297,
+      "grad_norm": 1.375,
+      "learning_rate": 1.3548127503431038e-06,
+      "loss": 1.7029,
+      "step": 420
+    },
+    {
+      "epoch": 1.1707988980716253,
+      "grad_norm": 1.375,
+      "learning_rate": 1.3412656297749135e-06,
+      "loss": 1.6998,
+      "step": 425
+    },
+    {
+      "epoch": 1.184573002754821,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.327647245951058e-06,
+      "loss": 1.7051,
+      "step": 430
+    },
+    {
+      "epoch": 1.1983471074380165,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.3139604426691072e-06,
+      "loss": 1.7065,
+      "step": 435
+    },
+    {
+      "epoch": 1.2121212121212122,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.300208078014014e-06,
+      "loss": 1.7019,
+      "step": 440
+    },
+    {
+      "epoch": 1.2258953168044078,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.2863930237612896e-06,
+      "loss": 1.6999,
+      "step": 445
+    },
+    {
+      "epoch": 1.2396694214876034,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.2725181647773174e-06,
+      "loss": 1.7046,
+      "step": 450
+    },
+    {
+      "epoch": 1.2534435261707988,
+      "grad_norm": 1.390625,
+      "learning_rate": 1.2585863984169343e-06,
+      "loss": 1.7069,
+      "step": 455
+    },
+    {
+      "epoch": 1.2672176308539944,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.2446006339184035e-06,
+      "loss": 1.7025,
+      "step": 460
+    },
+    {
+      "epoch": 1.28099173553719,
+      "grad_norm": 1.40625,
+      "learning_rate": 1.2305637917959058e-06,
+      "loss": 1.7016,
+      "step": 465
+    },
+    {
+      "epoch": 1.2947658402203857,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.2164788032296755e-06,
+      "loss": 1.6997,
+      "step": 470
+    },
+    {
+      "epoch": 1.3085399449035813,
+      "grad_norm": 1.328125,
+      "learning_rate": 1.2023486094539124e-06,
+      "loss": 1.7003,
+      "step": 475
+    },
+    {
+      "epoch": 1.322314049586777,
+      "grad_norm": 1.3671875,
+      "learning_rate": 1.1881761611425888e-06,
+      "loss": 1.6885,
+      "step": 480
+    },
+    {
+      "epoch": 1.3360881542699725,
+      "grad_norm": 1.375,
+      "learning_rate": 1.1739644177932907e-06,
+      "loss": 1.7,
+      "step": 485
+    },
+    {
+      "epoch": 1.3498622589531681,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.159716347109213e-06,
+      "loss": 1.6989,
+      "step": 490
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 1.3671875,
+      "learning_rate": 1.1454349243794419e-06,
+      "loss": 1.7009,
+      "step": 495
+    },
+    {
+      "epoch": 1.3774104683195592,
+      "grad_norm": 1.359375,
+      "learning_rate": 1.1311231318576545e-06,
+      "loss": 1.7007,
+      "step": 500
+    },
+    {
+      "epoch": 1.3911845730027548,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.1167839581393628e-06,
+      "loss": 1.6918,
+      "step": 505
+    },
+    {
+      "epoch": 1.4049586776859504,
+      "grad_norm": 1.296875,
+      "learning_rate": 1.1024203975378335e-06,
+      "loss": 1.6975,
+      "step": 510
+    },
+    {
+      "epoch": 1.418732782369146,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.0880354494588138e-06,
+      "loss": 1.6998,
+      "step": 515
+    },
+    {
+      "epoch": 1.4325068870523416,
+      "grad_norm": 1.375,
+      "learning_rate": 1.073632117774195e-06,
+      "loss": 1.6973,
+      "step": 520
+    },
+    {
+      "epoch": 1.4462809917355373,
+      "grad_norm": 1.296875,
+      "learning_rate": 1.0592134101947417e-06,
+      "loss": 1.7023,
+      "step": 525
+    },
+    {
+      "epoch": 1.4600550964187327,
+      "grad_norm": 1.3125,
+      "learning_rate": 1.0447823376420206e-06,
+      "loss": 1.707,
+      "step": 530
+    },
+    {
+      "epoch": 1.4738292011019283,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.0303419136196575e-06,
+      "loss": 1.6916,
+      "step": 535
+    },
+    {
+      "epoch": 1.487603305785124,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.0158951535840576e-06,
+      "loss": 1.6977,
+      "step": 540
+    },
+    {
+      "epoch": 1.5013774104683195,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.0014450743147145e-06,
+      "loss": 1.699,
+      "step": 545
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "grad_norm": 1.3046875,
+      "learning_rate": 9.869946932842466e-07,
+      "loss": 1.6907,
+      "step": 550
+    },
+    {
+      "epoch": 1.5289256198347108,
+      "grad_norm": 1.3046875,
+      "learning_rate": 9.725470280282855e-07,
+      "loss": 1.7001,
+      "step": 555
+    },
+    {
+      "epoch": 1.5426997245179064,
+      "grad_norm": 1.3125,
+      "learning_rate": 9.581050955153545e-07,
+      "loss": 1.7054,
+      "step": 560
+    },
+    {
+      "epoch": 1.556473829201102,
+      "grad_norm": 1.296875,
+      "learning_rate": 9.43671911516861e-07,
+      "loss": 1.687,
+      "step": 565
+    },
+    {
+      "epoch": 1.5702479338842976,
+      "grad_norm": 1.3203125,
+      "learning_rate": 9.292504899773453e-07,
+      "loss": 1.7055,
+      "step": 570
+    },
+    {
+      "epoch": 1.5840220385674932,
+      "grad_norm": 1.28125,
+      "learning_rate": 9.148438423851041e-07,
+      "loss": 1.6975,
+      "step": 575
+    },
+    {
+      "epoch": 1.5977961432506889,
+      "grad_norm": 1.2890625,
+      "learning_rate": 9.00454977143331e-07,
+      "loss": 1.6997,
+      "step": 580
+    },
+    {
+      "epoch": 1.6115702479338843,
+      "grad_norm": 1.296875,
+      "learning_rate": 8.860868989419017e-07,
+      "loss": 1.6983,
+      "step": 585
+    },
+    {
+      "epoch": 1.6253443526170799,
+      "grad_norm": 1.3515625,
+      "learning_rate": 8.717426081299308e-07,
+      "loss": 1.6995,
+      "step": 590
+    },
+    {
+      "epoch": 1.6391184573002755,
+      "grad_norm": 1.296875,
+      "learning_rate": 8.574251000892386e-07,
+      "loss": 1.6948,
+      "step": 595
+    },
+    {
+      "epoch": 1.6528925619834711,
+      "grad_norm": 1.3515625,
+      "learning_rate": 8.431373646088549e-07,
+      "loss": 1.6961,
+      "step": 600
+    },
+    {
+      "epoch": 1.6528925619834711,
+      "eval_loss": 1.7039618492126465,
+      "eval_runtime": 8.3736,
+      "eval_samples_per_second": 84.073,
+      "eval_steps_per_second": 2.627,
+      "step": 600
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 1.2890625,
+      "learning_rate": 8.288823852606882e-07,
+      "loss": 1.6915,
+      "step": 605
+    },
+    {
+      "epoch": 1.6804407713498621,
+      "grad_norm": 1.296875,
+      "learning_rate": 8.14663138776496e-07,
+      "loss": 1.7006,
+      "step": 610
+    },
+    {
+      "epoch": 1.6942148760330578,
+      "grad_norm": 1.2890625,
+      "learning_rate": 8.004825944262805e-07,
+      "loss": 1.7029,
+      "step": 615
+    },
+    {
+      "epoch": 1.7079889807162534,
+      "grad_norm": 1.375,
+      "learning_rate": 7.863437133982471e-07,
+      "loss": 1.6942,
+      "step": 620
+    },
+    {
+      "epoch": 1.721763085399449,
+      "grad_norm": 1.3125,
+      "learning_rate": 7.722494481804445e-07,
+      "loss": 1.6927,
+      "step": 625
+    },
+    {
+      "epoch": 1.7355371900826446,
+      "grad_norm": 1.3046875,
+      "learning_rate": 7.582027419442268e-07,
+      "loss": 1.6929,
+      "step": 630
+    },
+    {
+      "epoch": 1.7493112947658402,
+      "grad_norm": 1.28125,
+      "learning_rate": 7.442065279296578e-07,
+      "loss": 1.702,
+      "step": 635
+    },
+    {
+      "epoch": 1.7630853994490359,
+      "grad_norm": 1.28125,
+      "learning_rate": 7.302637288329915e-07,
+      "loss": 1.6971,
+      "step": 640
+    },
+    {
+      "epoch": 1.7768595041322315,
+      "grad_norm": 1.2734375,
+      "learning_rate": 7.163772561963519e-07,
+      "loss": 1.6923,
+      "step": 645
+    },
+    {
+      "epoch": 1.790633608815427,
+      "grad_norm": 1.28125,
+      "learning_rate": 7.02550009799745e-07,
+      "loss": 1.6883,
+      "step": 650
+    },
+    {
+      "epoch": 1.8044077134986227,
+      "grad_norm": 1.3046875,
+      "learning_rate": 6.887848770555234e-07,
+      "loss": 1.6991,
+      "step": 655
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 1.296875,
+      "learning_rate": 6.750847324054374e-07,
+      "loss": 1.7069,
+      "step": 660
+    },
+    {
+      "epoch": 1.8319559228650137,
+      "grad_norm": 1.2890625,
+      "learning_rate": 6.614524367203906e-07,
+      "loss": 1.6979,
+      "step": 665
+    },
+    {
+      "epoch": 1.8457300275482094,
+      "grad_norm": 1.3046875,
+      "learning_rate": 6.478908367030338e-07,
+      "loss": 1.6829,
+      "step": 670
+    },
+    {
+      "epoch": 1.859504132231405,
+      "grad_norm": 1.3125,
+      "learning_rate": 6.344027642933128e-07,
+      "loss": 1.692,
+      "step": 675
+    },
+    {
+      "epoch": 1.8732782369146006,
+      "grad_norm": 1.28125,
+      "learning_rate": 6.209910360771033e-07,
+      "loss": 1.703,
+      "step": 680
+    },
+    {
+      "epoch": 1.887052341597796,
+      "grad_norm": 1.328125,
+      "learning_rate": 6.076584526980484e-07,
+      "loss": 1.706,
+      "step": 685
+    },
+    {
+      "epoch": 1.9008264462809916,
+      "grad_norm": 1.2890625,
+      "learning_rate": 5.944077982727285e-07,
+      "loss": 1.6906,
+      "step": 690
+    },
+    {
+      "epoch": 1.9146005509641872,
+      "grad_norm": 1.2734375,
+      "learning_rate": 5.812418398092787e-07,
+      "loss": 1.6982,
+      "step": 695
+    },
+    {
+      "epoch": 1.9283746556473829,
+      "grad_norm": 1.3359375,
+      "learning_rate": 5.681633266295834e-07,
+      "loss": 1.6917,
+      "step": 700
+    },
+    {
+      "epoch": 1.9421487603305785,
+      "grad_norm": 1.28125,
+      "learning_rate": 5.551749897951582e-07,
+      "loss": 1.7014,
+      "step": 705
+    },
+    {
+      "epoch": 1.955922865013774,
+      "grad_norm": 1.2734375,
+      "learning_rate": 5.422795415368518e-07,
+      "loss": 1.6861,
+      "step": 710
+    },
+    {
+      "epoch": 1.9696969696969697,
+      "grad_norm": 1.3046875,
+      "learning_rate": 5.294796746884745e-07,
+      "loss": 1.6953,
+      "step": 715
+    },
+    {
+      "epoch": 1.9834710743801653,
+      "grad_norm": 1.3359375,
+      "learning_rate": 5.167780621244801e-07,
+      "loss": 1.6973,
+      "step": 720
+    },
+    {
+      "epoch": 1.997245179063361,
+      "grad_norm": 1.2890625,
+      "learning_rate": 5.041773562018135e-07,
+      "loss": 1.7019,
+      "step": 725
+    },
+    {
+      "epoch": 2.0110192837465566,
+      "grad_norm": 1.265625,
+      "learning_rate": 4.91680188206047e-07,
+      "loss": 1.7011,
+      "step": 730
+    },
+    {
+      "epoch": 2.024793388429752,
+      "grad_norm": 1.265625,
+      "learning_rate": 4.792891678019115e-07,
+      "loss": 1.7013,
+      "step": 735
+    },
+    {
+      "epoch": 2.038567493112948,
+      "grad_norm": 1.2734375,
+      "learning_rate": 4.6700688248834664e-07,
+      "loss": 1.6895,
+      "step": 740
+    },
+    {
+      "epoch": 2.0523415977961434,
+      "grad_norm": 1.3125,
+      "learning_rate": 4.548358970581757e-07,
+      "loss": 1.7029,
+      "step": 745
+    },
+    {
+      "epoch": 2.0661157024793386,
+      "grad_norm": 1.2734375,
+      "learning_rate": 4.427787530625278e-07,
+      "loss": 1.6931,
+      "step": 750
+    },
+    {
+      "epoch": 2.0798898071625342,
+      "grad_norm": 1.2734375,
+      "learning_rate": 4.3083796828010675e-07,
+      "loss": 1.6886,
+      "step": 755
+    },
+    {
+      "epoch": 2.09366391184573,
+      "grad_norm": 1.3046875,
+      "learning_rate": 4.190160361914292e-07,
+      "loss": 1.6907,
+      "step": 760
+    },
+    {
+      "epoch": 2.1074380165289255,
+      "grad_norm": 1.296875,
+      "learning_rate": 4.07315425458134e-07,
+      "loss": 1.6924,
+      "step": 765
+    },
+    {
+      "epoch": 2.121212121212121,
+      "grad_norm": 1.28125,
+      "learning_rate": 3.9573857940747537e-07,
+      "loss": 1.7019,
+      "step": 770
+    },
+    {
+      "epoch": 2.1349862258953167,
+      "grad_norm": 1.28125,
+      "learning_rate": 3.8428791552210594e-07,
+      "loss": 1.6975,
+      "step": 775
+    },
+    {
+      "epoch": 2.1487603305785123,
+      "grad_norm": 1.2890625,
+      "learning_rate": 3.729658249352563e-07,
+      "loss": 1.6986,
+      "step": 780
+    },
+    {
+      "epoch": 2.162534435261708,
+      "grad_norm": 1.3125,
+      "learning_rate": 3.6177467193141886e-07,
+      "loss": 1.6893,
+      "step": 785
+    },
+    {
+      "epoch": 2.1763085399449036,
+      "grad_norm": 1.2734375,
+      "learning_rate": 3.5071679345263537e-07,
+      "loss": 1.6833,
+      "step": 790
+    },
+    {
+      "epoch": 2.190082644628099,
+      "grad_norm": 1.2734375,
+      "learning_rate": 3.397944986104968e-07,
+      "loss": 1.693,
+      "step": 795
+    },
+    {
+      "epoch": 2.203856749311295,
+      "grad_norm": 1.3046875,
+      "learning_rate": 3.290100682039516e-07,
+      "loss": 1.6978,
+      "step": 800
+    },
+    {
+      "epoch": 2.2176308539944904,
+      "grad_norm": 1.28125,
+      "learning_rate": 3.1836575424303034e-07,
+      "loss": 1.7019,
+      "step": 805
+    },
+    {
+      "epoch": 2.231404958677686,
+      "grad_norm": 1.2890625,
+      "learning_rate": 3.078637794785791e-07,
+      "loss": 1.6977,
+      "step": 810
+    },
+    {
+      "epoch": 2.2451790633608817,
+      "grad_norm": 1.28125,
+      "learning_rate": 2.9750633693810224e-07,
+      "loss": 1.6898,
+      "step": 815
+    },
+    {
+      "epoch": 2.2589531680440773,
+      "grad_norm": 1.28125,
+      "learning_rate": 2.872955894678153e-07,
+      "loss": 1.6915,
+      "step": 820
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 1.265625,
+      "learning_rate": 2.7723366928099754e-07,
+      "loss": 1.6922,
+      "step": 825
+    },
+    {
+      "epoch": 2.2865013774104685,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.673226775127422e-07,
+      "loss": 1.6922,
+      "step": 830
+    },
+    {
+      "epoch": 2.3002754820936637,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.5756468378119533e-07,
+      "loss": 1.6873,
+      "step": 835
+    },
+    {
+      "epoch": 2.3140495867768593,
+      "grad_norm": 1.265625,
+      "learning_rate": 2.4796172575537934e-07,
+      "loss": 1.7068,
+      "step": 840
+    },
+    {
+      "epoch": 2.327823691460055,
+      "grad_norm": 1.3203125,
+      "learning_rate": 2.3851580872968435e-07,
+      "loss": 1.6993,
+      "step": 845
+    },
+    {
+      "epoch": 2.3415977961432506,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.292289052051224e-07,
+      "loss": 1.6992,
+      "step": 850
+    },
+    {
+      "epoch": 2.355371900826446,
+      "grad_norm": 1.2578125,
+      "learning_rate": 2.2010295447742743e-07,
+      "loss": 1.6891,
+      "step": 855
+    },
+    {
+      "epoch": 2.369146005509642,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.111398622320927e-07,
+      "loss": 1.6968,
+      "step": 860
+    },
+    {
+      "epoch": 2.3829201101928374,
+      "grad_norm": 1.296875,
+      "learning_rate": 2.0234150014642305e-07,
+      "loss": 1.6946,
+      "step": 865
+    },
+    {
+      "epoch": 2.396694214876033,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.937097054986915e-07,
+      "loss": 1.6892,
+      "step": 870
+    },
+    {
+      "epoch": 2.4104683195592287,
+      "grad_norm": 1.2578125,
+      "learning_rate": 1.8524628078447602e-07,
+      "loss": 1.6915,
+      "step": 875
+    },
+    {
+      "epoch": 2.4242424242424243,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.769529933402637e-07,
+      "loss": 1.6946,
+      "step": 880
+    },
+    {
+      "epoch": 2.43801652892562,
+      "grad_norm": 1.2578125,
+      "learning_rate": 1.6883157497439349e-07,
+      "loss": 1.6975,
+      "step": 885
+    },
+    {
+      "epoch": 2.4517906336088156,
+      "grad_norm": 1.2734375,
+      "learning_rate": 1.6088372160541962e-07,
+      "loss": 1.6871,
+      "step": 890
+    },
+    {
+      "epoch": 2.465564738292011,
+      "grad_norm": 1.2734375,
+      "learning_rate": 1.531110929079681e-07,
+      "loss": 1.6909,
+      "step": 895
+    },
+    {
+      "epoch": 2.479338842975207,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.4551531196616396e-07,
+      "loss": 1.6908,
+      "step": 900
+    },
+    {
+      "epoch": 2.479338842975207,
+      "eval_loss": 1.7025996446609497,
+      "eval_runtime": 8.3873,
+      "eval_samples_per_second": 83.936,
+      "eval_steps_per_second": 2.623,
+      "step": 900
+    },
+    {
+      "epoch": 2.4931129476584024,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.3809796493469728e-07,
+      "loss": 1.6981,
+      "step": 905
+    },
+    {
+      "epoch": 2.5068870523415976,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.3086060070760196e-07,
+      "loss": 1.6902,
+      "step": 910
+    },
+    {
+      "epoch": 2.5206611570247937,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.23804730594814e-07,
+      "loss": 1.6964,
+      "step": 915
+    },
+    {
+      "epoch": 2.534435261707989,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.1693182800658042e-07,
+      "loss": 1.6884,
+      "step": 920
+    },
+    {
+      "epoch": 2.5482093663911844,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.102433281457802e-07,
+      "loss": 1.6969,
+      "step": 925
+    },
+    {
+      "epoch": 2.56198347107438,
+      "grad_norm": 1.265625,
+      "learning_rate": 1.0374062770822411e-07,
+      "loss": 1.7003,
+      "step": 930
+    },
+    {
+      "epoch": 2.5757575757575757,
+      "grad_norm": 1.2578125,
+      "learning_rate": 9.742508459099707e-08,
+      "loss": 1.7095,
+      "step": 935
+    },
+    {
+      "epoch": 2.5895316804407713,
+      "grad_norm": 1.2890625,
+      "learning_rate": 9.129801760890076e-08,
+      "loss": 1.7026,
+      "step": 940
+    },
+    {
+      "epoch": 2.603305785123967,
+      "grad_norm": 1.2578125,
+      "learning_rate": 8.536070621905811e-08,
+      "loss": 1.6964,
+      "step": 945
+    },
+    {
+      "epoch": 2.6170798898071626,
+      "grad_norm": 1.3046875,
+      "learning_rate": 7.961439025373617e-08,
+      "loss": 1.6984,
+      "step": 950
+    },
+    {
+      "epoch": 2.630853994490358,
+      "grad_norm": 1.28125,
+      "learning_rate": 7.40602696614444e-08,
+      "loss": 1.7022,
+      "step": 955
+    },
+    {
+      "epoch": 2.644628099173554,
+      "grad_norm": 1.2734375,
+      "learning_rate": 6.869950425636095e-08,
+      "loss": 1.6955,
+      "step": 960
+    },
+    {
+      "epoch": 2.6584022038567494,
+      "grad_norm": 1.265625,
+      "learning_rate": 6.353321347613815e-08,
+      "loss": 1.6962,
+      "step": 965
+    },
+    {
+      "epoch": 2.672176308539945,
+      "grad_norm": 1.3828125,
+      "learning_rate": 5.856247614814292e-08,
+      "loss": 1.6914,
+      "step": 970
+    },
+    {
+      "epoch": 2.6859504132231407,
+      "grad_norm": 1.296875,
+      "learning_rate": 5.3788330264174506e-08,
+      "loss": 1.6934,
+      "step": 975
+    },
+    {
+      "epoch": 2.6997245179063363,
+      "grad_norm": 1.28125,
+      "learning_rate": 4.921177276371069e-08,
+      "loss": 1.6947,
+      "step": 980
+    },
+    {
+      "epoch": 2.7134986225895315,
+      "grad_norm": 1.2890625,
+      "learning_rate": 4.483375932572597e-08,
+      "loss": 1.6929,
+      "step": 985
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 1.2890625,
+      "learning_rate": 4.0655204169127156e-08,
+      "loss": 1.6944,
+      "step": 990
+    },
+    {
+      "epoch": 2.7410468319559227,
+      "grad_norm": 1.2734375,
+      "learning_rate": 3.667697986184526e-08,
+      "loss": 1.6898,
+      "step": 995
+    },
+    {
+      "epoch": 2.7548209366391183,
+      "grad_norm": 1.2578125,
+      "learning_rate": 3.2899917138625055e-08,
+      "loss": 1.7061,
+      "step": 1000
+    },
+    {
+      "epoch": 2.768595041322314,
+      "grad_norm": 1.3125,
+      "learning_rate": 2.9324804727551055e-08,
+      "loss": 1.6974,
+      "step": 1005
+    },
+    {
+      "epoch": 2.7823691460055096,
+      "grad_norm": 1.2734375,
+      "learning_rate": 2.5952389185344925e-08,
+      "loss": 1.6892,
+      "step": 1010
+    },
+    {
+      "epoch": 2.796143250688705,
+      "grad_norm": 1.2421875,
+      "learning_rate": 2.2783374741469186e-08,
+      "loss": 1.696,
+      "step": 1015
+    },
+    {
+      "epoch": 2.809917355371901,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.9818423151069406e-08,
+      "loss": 1.6879,
+      "step": 1020
+    },
+    {
+      "epoch": 2.8236914600550964,
+      "grad_norm": 1.3125,
+      "learning_rate": 1.705815355678619e-08,
+      "loss": 1.6943,
+      "step": 1025
+    },
+    {
+      "epoch": 2.837465564738292,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.4503142359465925e-08,
+      "loss": 1.6919,
+      "step": 1030
+    },
+    {
+      "epoch": 2.8512396694214877,
+      "grad_norm": 1.2421875,
+      "learning_rate": 1.215392309779617e-08,
+      "loss": 1.6907,
+      "step": 1035
+    },
+    {
+      "epoch": 2.8650137741046833,
+      "grad_norm": 1.2734375,
+      "learning_rate": 1.0010986336891458e-08,
+      "loss": 1.704,
+      "step": 1040
+    },
+    {
+      "epoch": 2.878787878787879,
+      "grad_norm": 1.2578125,
+      "learning_rate": 8.074779565854117e-09,
+      "loss": 1.691,
+      "step": 1045
+    },
+    {
+      "epoch": 2.8925619834710745,
+      "grad_norm": 1.3203125,
+      "learning_rate": 6.34570710432869e-09,
+      "loss": 1.6975,
+      "step": 1050
+    },
+    {
+      "epoch": 2.90633608815427,
+      "grad_norm": 1.2734375,
+      "learning_rate": 4.824130018072026e-09,
+      "loss": 1.6918,
+      "step": 1055
+    },
+    {
+      "epoch": 2.9201101928374653,
+      "grad_norm": 1.2890625,
+      "learning_rate": 3.5103660435551465e-09,
+      "loss": 1.6933,
+      "step": 1060
+    },
+    {
+      "epoch": 2.9338842975206614,
+      "grad_norm": 1.3046875,
+      "learning_rate": 2.4046895216136563e-09,
+      "loss": 1.6872,
+      "step": 1065
+    },
+    {
+      "epoch": 2.9476584022038566,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.5073313401594568e-09,
+      "loss": 1.696,
+      "step": 1070
+    },
+    {
+      "epoch": 2.9614325068870526,
+      "grad_norm": 1.234375,
+      "learning_rate": 8.184788859667557e-10,
+      "loss": 1.6964,
+      "step": 1075
+    },
+    {
+      "epoch": 2.975206611570248,
+      "grad_norm": 1.3203125,
+      "learning_rate": 3.3827600554170444e-10,
+      "loss": 1.6941,
+      "step": 1080
+    },
+    {
+      "epoch": 2.9889807162534434,
+      "grad_norm": 1.3125,
+      "learning_rate": 6.682297508464608e-11,
+      "loss": 1.6993,
+      "step": 1085
+    },
+    {
+      "epoch": 3.0,
+      "step": 1089,
+      "total_flos": 6.427401199279931e+18,
+      "train_loss": 1.711617823146263,
+      "train_runtime": 5145.0339,
+      "train_samples_per_second": 13.545,
+      "train_steps_per_second": 0.212
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1089,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 300,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.427401199279931e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}