diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/checkpoint-15260/config.json b/checkpoint-15260/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09
--- /dev/null
+++ b/checkpoint-15260/config.json
@@ -0,0 +1,36 @@
+{
+  "_name_or_path": "meta-llama/Llama-3.1-8B",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128001,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0.dev0",
+  "use_cache": true,
+  "vocab_size": 128256
+}
diff --git a/checkpoint-15260/generation_config.json b/checkpoint-15260/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507
--- /dev/null
+++ b/checkpoint-15260/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": 128001,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.47.0.dev0"
+}
diff --git a/checkpoint-15260/model-00001-of-00007.safetensors b/checkpoint-15260/model-00001-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..897d8016fa0e1e5250a8e563879dca3cb5c2949b
--- /dev/null
+++ b/checkpoint-15260/model-00001-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e7546a0b591d1aebcb5a7a4ddc93298af1cc66f8cc0005a361a3725e93ae6f0
+size 4886466168
diff --git a/checkpoint-15260/model-00002-of-00007.safetensors b/checkpoint-15260/model-00002-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961
--- /dev/null
+++ b/checkpoint-15260/model-00002-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64
+size 4832007448
diff --git a/checkpoint-15260/model-00003-of-00007.safetensors b/checkpoint-15260/model-00003-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff
--- /dev/null
+++ b/checkpoint-15260/model-00003-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97
+size 4999813112
diff --git a/checkpoint-15260/model-00004-of-00007.safetensors b/checkpoint-15260/model-00004-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a
--- /dev/null
+++ b/checkpoint-15260/model-00004-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042
+size 4999813128
diff --git a/checkpoint-15260/model-00005-of-00007.safetensors b/checkpoint-15260/model-00005-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89
--- /dev/null
+++ b/checkpoint-15260/model-00005-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7
+size 4832007496
diff --git a/checkpoint-15260/model-00006-of-00007.safetensors b/checkpoint-15260/model-00006-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ca407cbae64cc8f0030b7f31acf2243bf4d1ab3e
--- /dev/null
+++ b/checkpoint-15260/model-00006-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df5d1993f96687d3eccecd21575ee99959b3c37e3e2fa3aa80d418fb95cbc368
+size 4999813120
diff --git a/checkpoint-15260/model-00007-of-00007.safetensors b/checkpoint-15260/model-00007-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..90f37e0f21263f934b2e15e7150e4e700babe776
--- /dev/null
+++ b/checkpoint-15260/model-00007-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b19907b3ea724b991bedf1922624123ba04d5ae5406f392a3d86198358d3ff2
+size 2571158184
diff --git a/checkpoint-15260/model.safetensors.index.json b/checkpoint-15260/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13
--- /dev/null
+++ b/checkpoint-15260/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 32121044992
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00007-of-00007.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.norm.weight": "model-00007-of-00007.safetensors"
+  }
+}
diff --git a/checkpoint-15260/optimizer.pt b/checkpoint-15260/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..954e7bb315aede4edbaa970b037ee355869f944c
--- /dev/null
+++ b/checkpoint-15260/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ca25d4f5128b30a1e43381a6319446763e5218dd8258ae8a8d5278a3f895ed4
+size 15385036334
diff --git a/checkpoint-15260/rng_state.pth b/checkpoint-15260/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93a52228ea16468eaab41b66555ced3eaaea002c
--- /dev/null
+++ b/checkpoint-15260/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
+size 14244
diff --git a/checkpoint-15260/scheduler.pt b/checkpoint-15260/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..38f65fadce527ecb01fb7d7fd7ef901b9b082813
--- /dev/null
+++ b/checkpoint-15260/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ce335347efaebf038b8915bc0a8d2d587a4a1aa08ad0e015b4bc7cc4fba634e
+size 1064
diff --git a/checkpoint-15260/trainer_state.json b/checkpoint-15260/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7afd2d7cb04092f504d209b647e1edebebb69382
--- /dev/null
+++ b/checkpoint-15260/trainer_state.json
@@ -0,0 +1,3477 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2203546467972044,
+  "eval_steps": 500,
+  "global_step": 15260,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0004476405013573615,
+      "grad_norm": 4.6696085929870605,
+      "learning_rate": 1.0157273918741808e-06,
+      "loss": 0.9366,
+      "step": 31
+    },
+    {
+      "epoch": 0.000895281002714723,
+      "grad_norm": 4.250915050506592,
+      "learning_rate": 2.0314547837483616e-06,
+      "loss": 0.9002,
+      "step": 62
+    },
+    {
+      "epoch": 0.0013429215040720846,
+      "grad_norm": 4.424270153045654,
+      "learning_rate": 3.0471821756225426e-06,
+      "loss": 0.8843,
+      "step": 93
+    },
+    {
+      "epoch": 0.001790562005429446,
+      "grad_norm": 4.56964635848999,
+      "learning_rate": 4.062909567496723e-06,
+      "loss": 0.8717,
+      "step": 124
+    },
+    {
+      "epoch": 0.0022382025067868077,
+      "grad_norm": 4.051624298095703,
+      "learning_rate": 5.078636959370905e-06,
+      "loss": 0.8711,
+      "step": 155
+    },
+    {
+      "epoch": 0.002685843008144169,
+      "grad_norm": 3.98006272315979,
+      "learning_rate": 6.094364351245085e-06,
+      "loss": 0.8628,
+      "step": 186
+    },
+    {
+      "epoch": 0.0031334835095015307,
+      "grad_norm": 4.4158406257629395,
+      "learning_rate": 7.110091743119267e-06,
+      "loss": 0.871,
+      "step": 217
+    },
+    {
+      "epoch": 0.003581124010858892,
+      "grad_norm": 4.681333541870117,
+      "learning_rate": 8.125819134993446e-06,
+      "loss": 0.8593,
+      "step": 248
+    },
+    {
+      "epoch": 0.004028764512216254,
+      "grad_norm": 3.8057820796966553,
+      "learning_rate": 9.141546526867629e-06,
+      "loss": 0.8558,
+      "step": 279
+    },
+    {
+      "epoch": 0.0044764050135736155,
+      "grad_norm": 4.523633003234863,
+      "learning_rate": 1.015727391874181e-05,
+      "loss": 0.8676,
+      "step": 310
+    },
+    {
+      "epoch": 0.0049240455149309765,
+      "grad_norm": 3.7387187480926514,
+      "learning_rate": 1.117300131061599e-05,
+      "loss": 0.8585,
+      "step": 341
+    },
+    {
+      "epoch": 0.005371686016288338,
+      "grad_norm": 4.187750816345215,
+      "learning_rate": 1.218872870249017e-05,
+      "loss": 0.8592,
+      "step": 372
+    },
+    {
+      "epoch": 0.005819326517645699,
+      "grad_norm": 3.782883644104004,
+      "learning_rate": 1.3204456094364351e-05,
+      "loss": 0.8449,
+      "step": 403
+    },
+    {
+      "epoch": 0.006266967019003061,
+      "grad_norm": 3.577796459197998,
+      "learning_rate": 1.4220183486238533e-05,
+      "loss": 0.8418,
+      "step": 434
+    },
+    {
+      "epoch": 0.006714607520360423,
+      "grad_norm": 3.1408321857452393,
+      "learning_rate": 1.5235910878112714e-05,
+      "loss": 0.8577,
+      "step": 465
+    },
+    {
+      "epoch": 0.007162248021717784,
+      "grad_norm": 4.090081691741943,
+      "learning_rate": 1.6251638269986893e-05,
+      "loss": 0.8439,
+      "step": 496
+    },
+    {
+      "epoch": 0.007609888523075146,
+      "grad_norm": 2.7458200454711914,
+      "learning_rate": 1.7267365661861077e-05,
+      "loss": 0.8468,
+      "step": 527
+    },
+    {
+      "epoch": 0.008057529024432507,
+      "grad_norm": 3.703225612640381,
+      "learning_rate": 1.8283093053735257e-05,
+      "loss": 0.8385,
+      "step": 558
+    },
+    {
+      "epoch": 0.008505169525789868,
+      "grad_norm": 3.134650230407715,
+      "learning_rate": 1.9298820445609438e-05,
+      "loss": 0.8418,
+      "step": 589
+    },
+    {
+      "epoch": 0.008952810027147231,
+      "grad_norm": 3.762680768966675,
+      "learning_rate": 2.031454783748362e-05,
+      "loss": 0.8312,
+      "step": 620
+    },
+    {
+      "epoch": 0.009400450528504592,
+      "grad_norm": 3.751004457473755,
+      "learning_rate": 2.13302752293578e-05,
+      "loss": 0.8251,
+      "step": 651
+    },
+    {
+      "epoch": 0.009848091029861953,
+      "grad_norm": 3.2268712520599365,
+      "learning_rate": 2.234600262123198e-05,
+      "loss": 0.8369,
+      "step": 682
+    },
+    {
+      "epoch": 0.010295731531219316,
+      "grad_norm": 3.5854289531707764,
+      "learning_rate": 2.336173001310616e-05,
+      "loss": 0.826,
+      "step": 713
+    },
+    {
+      "epoch": 0.010743372032576677,
+      "grad_norm": 3.9910435676574707,
+      "learning_rate": 2.437745740498034e-05,
+      "loss": 0.8168,
+      "step": 744
+    },
+    {
+      "epoch": 0.011191012533934038,
+      "grad_norm": 3.3059303760528564,
+      "learning_rate": 2.5393184796854525e-05,
+      "loss": 0.8269,
+      "step": 775
+    },
+    {
+      "epoch": 0.011638653035291399,
+      "grad_norm": 3.4081811904907227,
+      "learning_rate": 2.6408912188728702e-05,
+      "loss": 0.817,
+      "step": 806
+    },
+    {
+      "epoch": 0.012086293536648762,
+      "grad_norm": 3.2740163803100586,
+      "learning_rate": 2.7424639580602886e-05,
+      "loss": 0.8195,
+      "step": 837
+    },
+    {
+      "epoch": 0.012533934038006123,
+      "grad_norm": 2.7206223011016846,
+      "learning_rate": 2.8440366972477066e-05,
+      "loss": 0.8188,
+      "step": 868
+    },
+    {
+      "epoch": 0.012981574539363484,
+      "grad_norm": 2.7005629539489746,
+      "learning_rate": 2.9456094364351244e-05,
+      "loss": 0.8127,
+      "step": 899
+    },
+    {
+      "epoch": 0.013429215040720846,
+      "grad_norm": 2.970745325088501,
+      "learning_rate": 3.0471821756225428e-05,
+      "loss": 0.8126,
+      "step": 930
+    },
+    {
+      "epoch": 0.013876855542078207,
+      "grad_norm": 2.4761953353881836,
+      "learning_rate": 3.148754914809961e-05,
+      "loss": 0.82,
+      "step": 961
+    },
+    {
+      "epoch": 0.014324496043435568,
+      "grad_norm": 2.8555397987365723,
+      "learning_rate": 3.2503276539973785e-05,
+      "loss": 0.8166,
+      "step": 992
+    },
+    {
+      "epoch": 0.01477213654479293,
+      "grad_norm": 2.8124194145202637,
+      "learning_rate": 3.351900393184797e-05,
+      "loss": 0.8057,
+      "step": 1023
+    },
+    {
+      "epoch": 0.015219777046150292,
+      "grad_norm": 2.353851556777954,
+      "learning_rate": 3.453473132372215e-05,
+      "loss": 0.8064,
+      "step": 1054
+    },
+    {
+      "epoch": 0.015667417547507653,
+      "grad_norm": 3.0127620697021484,
+      "learning_rate": 3.555045871559633e-05,
+      "loss": 0.8086,
+      "step": 1085
+    },
+    {
+      "epoch": 0.016115058048865014,
+      "grad_norm": 2.792686939239502,
+      "learning_rate": 3.6566186107470514e-05,
+      "loss": 0.8152,
+      "step": 1116
+    },
+    {
+      "epoch": 0.016562698550222375,
+      "grad_norm": 2.407134532928467,
+      "learning_rate": 3.7581913499344695e-05,
+      "loss": 0.7949,
+      "step": 1147
+    },
+    {
+      "epoch": 0.017010339051579736,
+      "grad_norm": 2.6921393871307373,
+      "learning_rate": 3.8597640891218876e-05,
+      "loss": 0.804,
+      "step": 1178
+    },
+    {
+      "epoch": 0.0174579795529371,
+      "grad_norm": 2.3015975952148438,
+      "learning_rate": 3.9613368283093056e-05,
+      "loss": 0.7944,
+      "step": 1209
+    },
+    {
+      "epoch": 0.017905620054294462,
+      "grad_norm": 2.8116579055786133,
+      "learning_rate": 4.062909567496724e-05,
+      "loss": 0.7977,
+      "step": 1240
+    },
+    {
+      "epoch": 0.018353260555651823,
+      "grad_norm": 2.5720036029815674,
+      "learning_rate": 4.164482306684142e-05,
+      "loss": 0.7854,
+      "step": 1271
+    },
+    {
+      "epoch": 0.018800901057009184,
+      "grad_norm": 2.0802650451660156,
+      "learning_rate": 4.26605504587156e-05,
+      "loss": 0.7892,
+      "step": 1302
+    },
+    {
+      "epoch": 0.019248541558366545,
+      "grad_norm": 2.4343624114990234,
+      "learning_rate": 4.367627785058978e-05,
+      "loss": 0.7897,
+      "step": 1333
+    },
+    {
+      "epoch": 0.019696182059723906,
+      "grad_norm": 2.509686231613159,
+      "learning_rate": 4.469200524246396e-05,
+      "loss": 0.7855,
+      "step": 1364
+    },
+    {
+      "epoch": 0.020143822561081267,
+      "grad_norm": 2.626512289047241,
+      "learning_rate": 4.570773263433814e-05,
+      "loss": 0.7873,
+      "step": 1395
+    },
+    {
+      "epoch": 0.02059146306243863,
+      "grad_norm": 2.8619399070739746,
+      "learning_rate": 4.672346002621232e-05,
+      "loss": 0.7891,
+      "step": 1426
+    },
+    {
+      "epoch": 0.021039103563795993,
+      "grad_norm": 2.724792718887329,
+      "learning_rate": 4.77391874180865e-05,
+      "loss": 0.782,
+      "step": 1457
+    },
+    {
+      "epoch": 0.021486744065153354,
+      "grad_norm": 2.6659562587738037,
+      "learning_rate": 4.875491480996068e-05,
+      "loss": 0.7856,
+      "step": 1488
+    },
+    {
+      "epoch": 0.021934384566510715,
+      "grad_norm": 2.646078586578369,
+      "learning_rate": 4.977064220183487e-05,
+      "loss": 0.7748,
+      "step": 1519
+    },
+    {
+      "epoch": 0.022382025067868076,
+      "grad_norm": 2.429288387298584,
+      "learning_rate": 4.9999915451558777e-05,
+      "loss": 0.7722,
+      "step": 1550
+    },
+    {
+      "epoch": 0.022829665569225437,
+      "grad_norm": 1.9933409690856934,
+      "learning_rate": 4.999955597496219e-05,
+      "loss": 0.7874,
+      "step": 1581
+    },
+    {
+      "epoch": 0.023277306070582798,
+      "grad_norm": 2.314889907836914,
+      "learning_rate": 4.9998914381774255e-05,
+      "loss": 0.7757,
+      "step": 1612
+    },
+    {
+      "epoch": 0.023724946571940162,
+      "grad_norm": 2.2891199588775635,
+      "learning_rate": 4.999799067923527e-05,
+      "loss": 0.7713,
+      "step": 1643
+    },
+    {
+      "epoch": 0.024172587073297523,
+      "grad_norm": 2.4892444610595703,
+      "learning_rate": 4.999678487776908e-05,
+      "loss": 0.7687,
+      "step": 1674
+    },
+    {
+      "epoch": 0.024620227574654884,
+      "grad_norm": 2.3015685081481934,
+      "learning_rate": 4.9995296990983006e-05,
+      "loss": 0.7721,
+      "step": 1705
+    },
+    {
+      "epoch": 0.025067868076012245,
+      "grad_norm": 2.278954029083252,
+      "learning_rate": 4.999352703566763e-05,
+      "loss": 0.7741,
+      "step": 1736
+    },
+    {
+      "epoch": 0.025515508577369606,
+      "grad_norm": 1.7260370254516602,
+      "learning_rate": 4.999147503179668e-05,
+      "loss": 0.7681,
+      "step": 1767
+    },
+    {
+      "epoch": 0.025963149078726967,
+      "grad_norm": 2.0179309844970703,
+      "learning_rate": 4.998914100252672e-05,
+      "loss": 0.7604,
+      "step": 1798
+    },
+    {
+      "epoch": 0.02641078958008433,
+      "grad_norm": 2.53022837638855,
+      "learning_rate": 4.998652497419696e-05,
+      "loss": 0.7598,
+      "step": 1829
+    },
+    {
+      "epoch": 0.026858430081441693,
+      "grad_norm": 1.859253168106079,
+      "learning_rate": 4.9983626976328927e-05,
+      "loss": 0.7606,
+      "step": 1860
+    },
+    {
+      "epoch": 0.027306070582799054,
+      "grad_norm": 1.759303331375122,
+      "learning_rate": 4.998044704162613e-05,
+      "loss": 0.7532,
+      "step": 1891
+    },
+    {
+      "epoch": 0.027753711084156415,
+      "grad_norm": 2.4389419555664062,
+      "learning_rate": 4.9976985205973705e-05,
+      "loss": 0.7646,
+      "step": 1922
+    },
+    {
+      "epoch": 0.028201351585513776,
+      "grad_norm": 2.155348777770996,
+      "learning_rate": 4.997324150843799e-05,
+      "loss": 0.7569,
+      "step": 1953
+    },
+    {
+      "epoch": 0.028648992086871137,
+      "grad_norm": 2.0138537883758545,
+      "learning_rate": 4.99692159912661e-05,
+      "loss": 0.7677,
+      "step": 1984
+    },
+    {
+      "epoch": 0.029096632588228498,
+      "grad_norm": 2.5275282859802246,
+      "learning_rate": 4.996490869988546e-05,
+      "loss": 0.7519,
+      "step": 2015
+    },
+    {
+      "epoch": 0.02954427308958586,
+      "grad_norm": 1.8147333860397339,
+      "learning_rate": 4.996031968290326e-05,
+      "loss": 0.7509,
+      "step": 2046
+    },
+    {
+      "epoch": 0.029991913590943223,
+      "grad_norm": 2.1941769123077393,
+      "learning_rate": 4.995544899210594e-05,
+      "loss": 0.754,
+      "step": 2077
+    },
+    {
+      "epoch": 0.030439554092300584,
+      "grad_norm": 1.8953059911727905,
+      "learning_rate": 4.9950296682458583e-05,
+      "loss": 0.747,
+      "step": 2108
+    },
+    {
+      "epoch": 0.030887194593657945,
+      "grad_norm": 3.3973031044006348,
+      "learning_rate": 4.994486281210429e-05,
+      "loss": 0.7513,
+      "step": 2139
+    },
+    {
+      "epoch": 0.031334835095015307,
+      "grad_norm": 2.66795015335083,
+      "learning_rate": 4.9939147442363566e-05,
+      "loss": 0.7469,
+      "step": 2170
+    },
+    {
+      "epoch": 0.03178247559637267,
+      "grad_norm": 1.6254230737686157,
+      "learning_rate": 4.9933150637733574e-05,
+      "loss": 0.7297,
+      "step": 2201
+    },
+    {
+      "epoch": 0.03223011609773003,
+      "grad_norm": 1.822745680809021,
+      "learning_rate": 4.992687246588743e-05,
+      "loss": 0.754,
+      "step": 2232
+    },
+    {
+      "epoch": 0.03267775659908739,
+      "grad_norm": 1.6898781061172485,
+      "learning_rate": 4.992031299767347e-05,
+      "loss": 0.7478,
+      "step": 2263
+    },
+    {
+      "epoch": 0.03312539710044475,
+      "grad_norm": 1.799280047416687,
+      "learning_rate": 4.9913472307114386e-05,
+      "loss": 0.746,
+      "step": 2294
+    },
+    {
+      "epoch": 0.033573037601802115,
+      "grad_norm": 2.2501840591430664,
+      "learning_rate": 4.9906350471406446e-05,
+      "loss": 0.7408,
+      "step": 2325
+    },
+    {
+      "epoch": 0.03402067810315947,
+      "grad_norm": 2.3315324783325195,
+      "learning_rate": 4.989894757091861e-05,
+      "loss": 0.7301,
+      "step": 2356
+    },
+    {
+      "epoch": 0.03446831860451684,
+      "grad_norm": 1.5820438861846924,
+      "learning_rate": 4.989126368919158e-05,
+      "loss": 0.7305,
+      "step": 2387
+    },
+    {
+      "epoch": 0.0349159591058742,
+      "grad_norm": 2.5696022510528564,
+      "learning_rate": 4.988329891293693e-05,
+      "loss": 0.7337,
+      "step": 2418
+    },
+    {
+      "epoch": 0.03536359960723156,
+      "grad_norm": 1.8880938291549683,
+      "learning_rate": 4.987505333203608e-05,
+      "loss": 0.7385,
+      "step": 2449
+    },
+    {
+      "epoch": 0.035811240108588924,
+      "grad_norm": 2.6148738861083984,
+      "learning_rate": 4.9866527039539276e-05,
+      "loss": 0.7292,
+      "step": 2480
+    },
+    {
+      "epoch": 0.03625888060994628,
+      "grad_norm": 1.6925290822982788,
+      "learning_rate": 4.9857720131664594e-05,
+      "loss": 0.7344,
+      "step": 2511
+    },
+    {
+      "epoch": 0.036706521111303646,
+      "grad_norm": 1.7675210237503052,
+      "learning_rate": 4.9848632707796773e-05,
+      "loss": 0.7354,
+      "step": 2542
+    },
+    {
+      "epoch": 0.037154161612661,
+      "grad_norm": 2.1053173542022705,
+      "learning_rate": 4.9839264870486155e-05,
+      "loss": 0.7272,
+      "step": 2573
+    },
+    {
+      "epoch": 0.03760180211401837,
+      "grad_norm": 1.9718347787857056,
+      "learning_rate": 4.9829616725447526e-05,
+      "loss": 0.7336,
+      "step": 2604
+    },
+    {
+      "epoch": 0.03804944261537573,
+      "grad_norm": 1.5777671337127686,
+      "learning_rate": 4.981968838155888e-05,
+      "loss": 0.7182,
+      "step": 2635
+    },
+    {
+      "epoch": 0.03849708311673309,
+      "grad_norm": 1.905127763748169,
+      "learning_rate": 4.980947995086024e-05,
+      "loss": 0.7296,
+      "step": 2666
+    },
+    {
+      "epoch": 0.038944723618090454,
+      "grad_norm": 1.63962721824646,
+      "learning_rate": 4.979899154855234e-05,
+      "loss": 0.7249,
+      "step": 2697
+    },
+    {
+      "epoch": 0.03939236411944781,
+      "grad_norm": 1.584331750869751,
+      "learning_rate": 4.9788223292995386e-05,
+      "loss": 0.7345,
+      "step": 2728
+    },
+    {
+      "epoch": 0.039840004620805176,
+      "grad_norm": 1.9111014604568481,
+      "learning_rate": 4.977717530570768e-05,
+      "loss": 0.7225,
+      "step": 2759
+    },
+    {
+      "epoch": 0.040287645122162534,
+      "grad_norm": 2.3216073513031006,
+      "learning_rate": 4.976584771136425e-05,
+      "loss": 0.7207,
+      "step": 2790
+    },
+    {
+      "epoch": 0.0407352856235199,
+      "grad_norm": 1.6002410650253296,
+      "learning_rate": 4.975424063779547e-05,
+      "loss": 0.7228,
+      "step": 2821
+    },
+    {
+      "epoch": 0.04118292612487726,
+      "grad_norm": 2.104731798171997,
+      "learning_rate": 4.974235421598557e-05,
+      "loss": 0.7127,
+      "step": 2852
+    },
+    {
+      "epoch": 0.04163056662623462,
+      "grad_norm": 1.7114660739898682,
+      "learning_rate": 4.973018858007122e-05,
+      "loss": 0.7283,
+      "step": 2883
+    },
+    {
+      "epoch": 0.042078207127591985,
+      "grad_norm": 1.948133945465088,
+      "learning_rate": 4.9717743867339963e-05,
+      "loss": 0.7209,
+      "step": 2914
+    },
+    {
+      "epoch": 0.04252584762894934,
+      "grad_norm": 1.621764898300171,
+      "learning_rate": 4.9705020218228695e-05,
+      "loss": 0.7272,
+      "step": 2945
+    },
+    {
+      "epoch": 0.04297348813030671,
+      "grad_norm": 1.6967558860778809,
+      "learning_rate": 4.969201777632205e-05,
+      "loss": 0.7191,
+      "step": 2976
+    },
+    {
+      "epoch": 0.043421128631664065,
+      "grad_norm": 1.6656996011734009,
+      "learning_rate": 4.9678736688350846e-05,
+      "loss": 0.7205,
+      "step": 3007
+    },
+    {
+      "epoch": 0.04386876913302143,
+      "grad_norm": 2.151475191116333,
+      "learning_rate": 4.966517710419033e-05,
+      "loss": 0.7168,
+      "step": 3038
+    },
+    {
+      "epoch": 0.044316409634378794,
+      "grad_norm": 2.213109016418457,
+      "learning_rate": 4.965133917685858e-05,
+      "loss": 0.7139,
+      "step": 3069
+    },
+    {
+      "epoch": 0.04476405013573615,
+      "grad_norm": 1.5380377769470215,
+      "learning_rate": 4.9637223062514714e-05,
+      "loss": 0.7237,
+      "step": 3100
+    },
+    {
+      "epoch": 0.045211690637093516,
+      "grad_norm": 2.312377452850342,
+      "learning_rate": 4.962282892045718e-05,
+      "loss": 0.7156,
+      "step": 3131
+    },
+    {
+      "epoch": 0.04565933113845087,
+      "grad_norm": 1.7220717668533325,
+      "learning_rate": 4.9608156913121904e-05,
+      "loss": 0.7122,
+      "step": 3162
+    },
+    {
+      "epoch": 0.04610697163980824,
+      "grad_norm": 1.802856206893921,
+      "learning_rate": 4.959320720608049e-05,
+      "loss": 0.7128,
+      "step": 3193
+    },
+    {
+      "epoch": 0.046554612141165595,
+      "grad_norm": 1.6629964113235474,
+      "learning_rate": 4.9577979968038354e-05,
+      "loss": 0.7172,
+      "step": 3224
+    },
+    {
+      "epoch": 0.04700225264252296,
+      "grad_norm": 3.440115213394165,
+      "learning_rate": 4.956247537083282e-05,
+      "loss": 0.7213,
+      "step": 3255
+    },
+    {
+      "epoch": 0.047449893143880324,
+      "grad_norm": 1.5721139907836914,
+      "learning_rate": 4.9546693589431145e-05,
+      "loss": 0.7148,
+      "step": 3286
+    },
+    {
+      "epoch": 0.04789753364523768,
+      "grad_norm": 2.0920398235321045,
+      "learning_rate": 4.9530634801928595e-05,
+      "loss": 0.7145,
+      "step": 3317
+    },
+    {
+      "epoch": 0.048345174146595046,
+      "grad_norm": 1.666566014289856,
+      "learning_rate": 4.9514299189546395e-05,
+      "loss": 0.7095,
+      "step": 3348
+    },
+    {
+      "epoch": 0.048792814647952404,
+      "grad_norm": 1.8222129344940186,
+      "learning_rate": 4.949768693662973e-05,
+      "loss": 0.7138,
+      "step": 3379
+    },
+    {
+      "epoch": 0.04924045514930977,
+      "grad_norm": 1.7302964925765991,
+      "learning_rate": 4.948079823064559e-05,
+      "loss": 0.7017,
+      "step": 3410
+    },
+    {
+      "epoch": 0.049688095650667126,
+      "grad_norm": 1.7338463068008423,
+      "learning_rate": 4.946363326218074e-05,
+      "loss": 0.6979,
+      "step": 3441
+    },
+    {
+      "epoch": 0.05013573615202449,
+      "grad_norm": 1.5637450218200684,
+      "learning_rate": 4.9446192224939525e-05,
+      "loss": 0.7011,
+      "step": 3472
+    },
+    {
+      "epoch": 0.050583376653381855,
+      "grad_norm": 1.5632222890853882,
+      "learning_rate": 4.942847531574167e-05,
+      "loss": 0.704,
+      "step": 3503
+    },
+    {
+      "epoch": 0.05103101715473921,
+      "grad_norm": 1.588402509689331,
+      "learning_rate": 4.941048273452008e-05,
+      "loss": 0.7011,
+      "step": 3534
+    },
+    {
+      "epoch": 0.05147865765609658,
+      "grad_norm": 1.8840582370758057,
+      "learning_rate": 4.9392214684318605e-05,
+      "loss": 0.7016,
+      "step": 3565
+    },
+    {
+      "epoch": 0.051926298157453935,
+      "grad_norm": 1.2702268362045288,
+      "learning_rate": 4.93736713712897e-05,
+      "loss": 0.7004,
+      "step": 3596
+    },
+    {
+      "epoch": 0.0523739386588113,
+      "grad_norm": 1.3812692165374756,
+      "learning_rate": 4.9354853004692124e-05,
+      "loss": 0.7046,
+      "step": 3627
+    },
+    {
+      "epoch": 0.05282157916016866,
+      "grad_norm": 1.7257345914840698,
+      "learning_rate": 4.93357597968886e-05,
+      "loss": 0.6976,
+      "step": 3658
+    },
+    {
+      "epoch": 0.05326921966152602,
+      "grad_norm": 1.7458925247192383,
+      "learning_rate": 4.931639196334338e-05,
+      "loss": 0.6997,
+      "step": 3689
+    },
+    {
+      "epoch": 0.053716860162883386,
+      "grad_norm": 2.1996099948883057,
+      "learning_rate": 4.9296749722619826e-05,
+      "loss": 0.6991,
+      "step": 3720
+    },
+    {
+      "epoch": 0.05416450066424074,
+      "grad_norm": 1.6615021228790283,
+      "learning_rate": 4.9276833296377966e-05,
+      "loss": 0.7005,
+      "step": 3751
+    },
+    {
+      "epoch": 0.05461214116559811,
+      "grad_norm": 1.6276952028274536,
+      "learning_rate": 4.925664290937196e-05,
+      "loss": 0.7097,
+      "step": 3782
+    },
+    {
+      "epoch": 0.055059781666955465,
+      "grad_norm": 1.758227825164795,
+      "learning_rate": 4.9236178789447576e-05,
+      "loss": 0.6955,
+      "step": 3813
+    },
+    {
+      "epoch": 0.05550742216831283,
+      "grad_norm": 1.195280909538269,
+      "learning_rate": 4.921544116753962e-05,
+      "loss": 0.7073,
+      "step": 3844
+    },
+    {
+      "epoch": 0.05595506266967019,
+      "grad_norm": 1.6281015872955322,
+      "learning_rate": 4.919443027766935e-05,
+      "loss": 0.7022,
+      "step": 3875
+    },
+    {
+      "epoch": 0.05640270317102755,
+      "grad_norm": 1.3543150424957275,
+      "learning_rate": 4.91731463569418e-05,
+      "loss": 0.7036,
+      "step": 3906
+    },
+    {
+      "epoch": 0.056850343672384916,
+      "grad_norm": 2.16947078704834,
+      "learning_rate": 4.915158964554312e-05,
+      "loss": 0.7007,
+      "step": 3937
+    },
+    {
+      "epoch": 0.057297984173742274,
+      "grad_norm": 1.324578881263733,
+      "learning_rate": 4.912976038673786e-05,
+      "loss": 0.6941,
+      "step": 3968
+    },
+    {
+      "epoch": 0.05774562467509964,
+      "grad_norm": 1.9811108112335205,
+      "learning_rate": 4.9107658826866254e-05,
+      "loss": 0.6908,
+      "step": 3999
+    },
+    {
+      "epoch": 0.058193265176456996,
+      "grad_norm": 1.2975554466247559,
+      "learning_rate": 4.908528521534139e-05,
+      "loss": 0.6936,
+      "step": 4030
+    },
+    {
+      "epoch": 0.05864090567781436,
+      "grad_norm": 1.583282232284546,
+      "learning_rate": 4.906263980464644e-05,
+      "loss": 0.698,
+      "step": 4061
+    },
+    {
+      "epoch": 0.05908854617917172,
+      "grad_norm": 1.3532944917678833,
+      "learning_rate": 4.903972285033178e-05,
+      "loss": 0.7049,
+      "step": 4092
+    },
+    {
+      "epoch": 0.05953618668052908,
+      "grad_norm": 2.1245481967926025,
+      "learning_rate": 4.901653461101213e-05,
+      "loss": 0.7016,
+      "step": 4123
+    },
+    {
+      "epoch": 0.05998382718188645,
+      "grad_norm": 1.6913797855377197,
+      "learning_rate": 4.8993075348363626e-05,
+      "loss": 0.6981,
+      "step": 4154
+    },
+    {
+      "epoch": 0.060431467683243804,
+      "grad_norm": 1.51249098777771,
+      "learning_rate": 4.896934532712084e-05,
+      "loss": 0.6955,
+      "step": 4185
+    },
+    {
+      "epoch": 0.06087910818460117,
+      "grad_norm": 1.3880395889282227,
+      "learning_rate": 4.8945344815073846e-05,
+      "loss": 0.6934,
+      "step": 4216
+    },
+    {
+      "epoch": 0.061326748685958526,
+      "grad_norm": 1.6354159116744995,
+      "learning_rate": 4.892107408306516e-05,
+      "loss": 0.6938,
+      "step": 4247
+    },
+    {
+      "epoch": 0.06177438918731589,
+      "grad_norm": 2.126742362976074,
+      "learning_rate": 4.889653340498669e-05,
+      "loss": 0.7003,
+      "step": 4278
+    },
+    {
+      "epoch": 0.06222202968867325,
+      "grad_norm": 1.7903707027435303,
+      "learning_rate": 4.8871723057776664e-05,
+      "loss": 0.6885,
+      "step": 4309
+    },
+    {
+      "epoch": 0.06266967019003061,
+      "grad_norm": 1.537806510925293,
+      "learning_rate": 4.8846643321416476e-05,
+      "loss": 0.6892,
+      "step": 4340
+    },
+    {
+      "epoch": 0.06311731069138797,
+      "grad_norm": 1.6445434093475342,
+      "learning_rate": 4.882129447892753e-05,
+      "loss": 0.6843,
+      "step": 4371
+    },
+    {
+      "epoch": 0.06356495119274534,
+      "grad_norm": 1.555373191833496,
+      "learning_rate": 4.8795676816368076e-05,
+      "loss": 0.6899,
+      "step": 4402
+    },
+    {
+      "epoch": 0.0640125916941027,
+      "grad_norm": 1.8370277881622314,
+      "learning_rate": 4.876979062282995e-05,
+      "loss": 0.6813,
+      "step": 4433
+    },
+    {
+      "epoch": 0.06446023219546006,
+      "grad_norm": 1.3132514953613281,
+      "learning_rate": 4.8743636190435325e-05,
+      "loss": 0.6832,
+      "step": 4464
+    },
+    {
+      "epoch": 0.06490787269681741,
+      "grad_norm": 1.3186298608779907,
+      "learning_rate": 4.871721381433344e-05,
+      "loss": 0.6879,
+      "step": 4495
+    },
+    {
+      "epoch": 0.06535551319817479,
+      "grad_norm": 1.4360268115997314,
+      "learning_rate": 4.869052379269719e-05,
+      "loss": 0.69,
+      "step": 4526
+    },
+    {
+      "epoch": 0.06580315369953214,
+      "grad_norm": 1.670765995979309,
+      "learning_rate": 4.866356642671985e-05,
+      "loss": 0.6865,
+      "step": 4557
+    },
+    {
+      "epoch": 0.0662507942008895,
+      "grad_norm": 1.7548723220825195,
+      "learning_rate": 4.8636342020611634e-05,
+      "loss": 0.6852,
+      "step": 4588
+    },
+    {
+      "epoch": 0.06669843470224687,
+      "grad_norm": 1.5086426734924316,
+      "learning_rate": 4.860885088159626e-05,
+      "loss": 0.6894,
+      "step": 4619
+    },
+    {
+      "epoch": 0.06714607520360423,
+      "grad_norm": 1.3140665292739868,
+      "learning_rate": 4.858109331990751e-05,
+      "loss": 0.6812,
+      "step": 4650
+    },
+    {
+      "epoch": 0.06759371570496159,
+      "grad_norm": 1.4212454557418823,
+      "learning_rate": 4.855306964878567e-05,
+      "loss": 0.6872,
+      "step": 4681
+    },
+    {
+      "epoch": 0.06804135620631895,
+      "grad_norm": 1.3034414052963257,
+      "learning_rate": 4.8524780184474084e-05,
+      "loss": 0.6901,
+      "step": 4712
+    },
+    {
+      "epoch": 0.06848899670767632,
+      "grad_norm": 1.3741438388824463,
+      "learning_rate": 4.8496225246215496e-05,
+      "loss": 0.6875,
+      "step": 4743
+    },
+    {
+      "epoch": 0.06893663720903367,
+      "grad_norm": 1.7262542247772217,
+      "learning_rate": 4.8467405156248505e-05,
+      "loss": 0.6868,
+      "step": 4774
+    },
+    {
+      "epoch": 0.06938427771039103,
+      "grad_norm": 1.3293650150299072,
+      "learning_rate": 4.843832023980392e-05,
+      "loss": 0.6891,
+      "step": 4805
+    },
+    {
+      "epoch": 0.0698319182117484,
+      "grad_norm": 1.3448151350021362,
+      "learning_rate": 4.840897082510106e-05,
+      "loss": 0.6765,
+      "step": 4836
+    },
+    {
+      "epoch": 0.07027955871310576,
+      "grad_norm": 2.961280584335327,
+      "learning_rate": 4.8379357243344084e-05,
+      "loss": 0.6939,
+      "step": 4867
+    },
+    {
+      "epoch": 0.07072719921446312,
+      "grad_norm": 1.8265361785888672,
+      "learning_rate": 4.8349479828718236e-05,
+      "loss": 0.677,
+      "step": 4898
+    },
+    {
+      "epoch": 0.07117483971582048,
+      "grad_norm": 1.490349531173706,
+      "learning_rate": 4.8319338918386075e-05,
+      "loss": 0.6778,
+      "step": 4929
+    },
+    {
+      "epoch": 0.07162248021717785,
+      "grad_norm": 1.3669307231903076,
+      "learning_rate": 4.828893485248369e-05,
+      "loss": 0.6746,
+      "step": 4960
+    },
+    {
+      "epoch": 0.0720701207185352,
+      "grad_norm": 1.3995884656906128,
+      "learning_rate": 4.825826797411682e-05,
+      "loss": 0.6757,
+      "step": 4991
+    },
+    {
+      "epoch": 0.07251776121989256,
+      "grad_norm": 1.1217372417449951,
+      "learning_rate": 4.822733862935702e-05,
+      "loss": 0.6832,
+      "step": 5022
+    },
+    {
+      "epoch": 0.07296540172124993,
+      "grad_norm": 1.2192097902297974,
+      "learning_rate": 4.819614716723775e-05,
+      "loss": 0.6868,
+      "step": 5053
+    },
+    {
+      "epoch": 0.07341304222260729,
+      "grad_norm": 1.5045067071914673,
+      "learning_rate": 4.8164693939750425e-05,
+      "loss": 0.6793,
+      "step": 5084
+    },
+    {
+      "epoch": 0.07386068272396465,
+      "grad_norm": 1.7127234935760498,
+      "learning_rate": 4.813297930184042e-05,
+      "loss": 0.6797,
+      "step": 5115
+    },
+    {
+      "epoch": 0.074308323225322,
+      "grad_norm": 1.846561312675476,
+      "learning_rate": 4.810100361140314e-05,
+      "loss": 0.6767,
+      "step": 5146
+    },
+    {
+      "epoch": 0.07475596372667938,
+      "grad_norm": 1.3076797723770142,
+      "learning_rate": 4.8068767229279885e-05,
+      "loss": 0.6855,
+      "step": 5177
+    },
+    {
+      "epoch": 0.07520360422803674,
+      "grad_norm": 1.4170383214950562,
+      "learning_rate": 4.8036270519253854e-05,
+      "loss": 0.681,
+      "step": 5208
+    },
+    {
+      "epoch": 0.0756512447293941,
+      "grad_norm": 1.2504942417144775,
+      "learning_rate": 4.8003513848046e-05,
+      "loss": 0.6778,
+      "step": 5239
+    },
+    {
+      "epoch": 0.07609888523075146,
+      "grad_norm": 1.1522283554077148,
+      "learning_rate": 4.79704975853109e-05,
+      "loss": 0.6749,
+      "step": 5270
+    },
+    {
+      "epoch": 0.07654652573210882,
+      "grad_norm": 1.6351525783538818,
+      "learning_rate": 4.793722210363262e-05,
+      "loss": 0.6745,
+      "step": 5301
+    },
+    {
+      "epoch": 0.07699416623346618,
+      "grad_norm": 1.5093014240264893,
+      "learning_rate": 4.7903687778520414e-05,
+      "loss": 0.6747,
+      "step": 5332
+    },
+    {
+      "epoch": 0.07744180673482354,
+      "grad_norm": 1.362160563468933,
+      "learning_rate": 4.7869894988404593e-05,
+      "loss": 0.673,
+      "step": 5363
+    },
+    {
+      "epoch": 0.07788944723618091,
+      "grad_norm": 1.2021727561950684,
+      "learning_rate": 4.783584411463221e-05,
+      "loss": 0.6768,
+      "step": 5394
+    },
+    {
+      "epoch": 0.07833708773753827,
+      "grad_norm": 2.1543540954589844,
+      "learning_rate": 4.780153554146274e-05,
+      "loss": 0.672,
+      "step": 5425
+    },
+    {
+      "epoch": 0.07878472823889562,
+      "grad_norm": 1.882712721824646,
+      "learning_rate": 4.7766969656063766e-05,
+      "loss": 0.6926,
+      "step": 5456
+    },
+    {
+      "epoch": 0.079232368740253,
+      "grad_norm": 1.3975650072097778,
+      "learning_rate": 4.773214684850662e-05,
+      "loss": 0.6747,
+      "step": 5487
+    },
+    {
+      "epoch": 0.07968000924161035,
+      "grad_norm": 1.3912913799285889,
+      "learning_rate": 4.769706751176193e-05,
+      "loss": 0.6756,
+      "step": 5518
+    },
+    {
+      "epoch": 0.08012764974296771,
+      "grad_norm": 1.7227635383605957,
+      "learning_rate": 4.7661732041695264e-05,
+      "loss": 0.6694,
+      "step": 5549
+    },
+    {
+      "epoch": 0.08057529024432507,
+      "grad_norm": 1.3151129484176636,
+      "learning_rate": 4.762614083706258e-05,
+      "loss": 0.6715,
+      "step": 5580
+    },
+    {
+      "epoch": 0.08102293074568244,
+      "grad_norm": 1.0972425937652588,
+      "learning_rate": 4.759029429950581e-05,
+      "loss": 0.6661,
+      "step": 5611
+    },
+    {
+      "epoch": 0.0814705712470398,
+      "grad_norm": 1.2346575260162354,
+      "learning_rate": 4.7554192833548235e-05,
+      "loss": 0.66,
+      "step": 5642
+    },
+    {
+      "epoch": 0.08191821174839715,
+      "grad_norm": 1.4536516666412354,
+      "learning_rate": 4.751783684659e-05,
+      "loss": 0.6743,
+      "step": 5673
+    },
+    {
+      "epoch": 0.08236585224975453,
+      "grad_norm": 1.1361631155014038,
+      "learning_rate": 4.748122674890348e-05,
+      "loss": 0.6791,
+      "step": 5704
+    },
+    {
+      "epoch": 0.08281349275111188,
+      "grad_norm": 1.2605111598968506,
+      "learning_rate": 4.7444362953628654e-05,
+      "loss": 0.6797,
+      "step": 5735
+    },
+    {
+      "epoch": 0.08326113325246924,
+      "grad_norm": 1.2355903387069702,
+      "learning_rate": 4.7407245876768424e-05,
+      "loss": 0.6642,
+      "step": 5766
+    },
+    {
+      "epoch": 0.0837087737538266,
+      "grad_norm": 1.6677048206329346,
+      "learning_rate": 4.736987593718397e-05,
+      "loss": 0.6759,
+      "step": 5797
+    },
+    {
+      "epoch": 0.08415641425518397,
+      "grad_norm": 1.4781981706619263,
+      "learning_rate": 4.733225355658999e-05,
+      "loss": 0.6707,
+      "step": 5828
+    },
+    {
+      "epoch": 0.08460405475654133,
+      "grad_norm": 1.138583779335022,
+      "learning_rate": 4.7294379159549926e-05,
+      "loss": 0.6636,
+      "step": 5859
+    },
+    {
+      "epoch": 0.08505169525789869,
+      "grad_norm": 1.529036283493042,
+      "learning_rate": 4.725625317347119e-05,
+      "loss": 0.6705,
+      "step": 5890
+    },
+    {
+      "epoch": 0.08549933575925606,
+      "grad_norm": 1.3216760158538818,
+      "learning_rate": 4.7217876028600374e-05,
+      "loss": 0.6714,
+      "step": 5921
+    },
+    {
+      "epoch": 0.08594697626061341,
+      "grad_norm": 1.1820168495178223,
+      "learning_rate": 4.717924815801832e-05,
+      "loss": 0.6757,
+      "step": 5952
+    },
+    {
+      "epoch": 0.08639461676197077,
+      "grad_norm": 1.393571138381958,
+      "learning_rate": 4.714036999763532e-05,
+      "loss": 0.6672,
+      "step": 5983
+    },
+    {
+      "epoch": 0.08684225726332813,
+      "grad_norm": 1.4574682712554932,
+      "learning_rate": 4.7101241986186116e-05,
+      "loss": 0.6655,
+      "step": 6014
+    },
+    {
+      "epoch": 0.0872898977646855,
+      "grad_norm": 1.138645887374878,
+      "learning_rate": 4.7061864565225e-05,
+      "loss": 0.6663,
+      "step": 6045
+    },
+    {
+      "epoch": 0.08773753826604286,
+      "grad_norm": 1.7602777481079102,
+      "learning_rate": 4.702223817912081e-05,
+      "loss": 0.6695,
+      "step": 6076
+    },
+    {
+      "epoch": 0.08818517876740022,
+      "grad_norm": 1.2323459386825562,
+      "learning_rate": 4.698236327505195e-05,
+      "loss": 0.6636,
+      "step": 6107
+    },
+    {
+      "epoch": 0.08863281926875759,
+      "grad_norm": 1.6881431341171265,
+      "learning_rate": 4.694224030300127e-05,
+      "loss": 0.6653,
+      "step": 6138
+    },
+    {
+      "epoch": 0.08908045977011494,
+      "grad_norm": 1.391417384147644,
+      "learning_rate": 4.690186971575107e-05,
+      "loss": 0.6636,
+      "step": 6169
+    },
+    {
+      "epoch": 0.0895281002714723,
+      "grad_norm": 1.3066257238388062,
+      "learning_rate": 4.6861251968877916e-05,
+      "loss": 0.6777,
+      "step": 6200
+    },
+    {
+      "epoch": 0.08997574077282966,
+      "grad_norm": 1.2001326084136963,
+      "learning_rate": 4.68203875207476e-05,
+      "loss": 0.6683,
+      "step": 6231
+    },
+    {
+      "epoch": 0.09042338127418703,
+      "grad_norm": 1.4361172914505005,
+      "learning_rate": 4.677927683250983e-05,
+      "loss": 0.6634,
+      "step": 6262
+    },
+    {
+      "epoch": 0.09087102177554439,
+      "grad_norm": 8.04520320892334,
+      "learning_rate": 4.6737920368093156e-05,
+      "loss": 0.6752,
+      "step": 6293
+    },
+    {
+      "epoch": 0.09131866227690175,
+      "grad_norm": 1.4874210357666016,
+      "learning_rate": 4.669631859419965e-05,
+      "loss": 0.6733,
+      "step": 6324
+    },
+    {
+      "epoch": 0.09176630277825912,
+      "grad_norm": 1.234491229057312,
+      "learning_rate": 4.6654471980299676e-05,
+      "loss": 0.668,
+      "step": 6355
+    },
+    {
+      "epoch": 0.09221394327961648,
+      "grad_norm": 1.2088687419891357,
+      "learning_rate": 4.661238099862658e-05,
+      "loss": 0.6705,
+      "step": 6386
+    },
+    {
+      "epoch": 0.09266158378097383,
+      "grad_norm": 1.1937814950942993,
+      "learning_rate": 4.657004612417138e-05,
+      "loss": 0.6853,
+      "step": 6417
+    },
+    {
+      "epoch": 0.09310922428233119,
+      "grad_norm": 1.5205374956130981,
+      "learning_rate": 4.6527467834677374e-05,
+      "loss": 0.685,
+      "step": 6448
+    },
+    {
+      "epoch": 0.09355686478368856,
+      "grad_norm": 1.2221660614013672,
+      "learning_rate": 4.648464661063478e-05,
+      "loss": 0.6622,
+      "step": 6479
+    },
+    {
+      "epoch": 0.09400450528504592,
+      "grad_norm": 1.0762608051300049,
+      "learning_rate": 4.6441582935275264e-05,
+      "loss": 0.669,
+      "step": 6510
+    },
+    {
+      "epoch": 0.09445214578640328,
+      "grad_norm": 1.4416946172714233,
+      "learning_rate": 4.6398277294566586e-05,
+      "loss": 0.6674,
+      "step": 6541
+    },
+    {
+      "epoch": 0.09489978628776065,
+      "grad_norm": 1.559158205986023,
+      "learning_rate": 4.6354730177207e-05,
+      "loss": 0.6681,
+      "step": 6572
+    },
+    {
+      "epoch": 0.095347426789118,
+      "grad_norm": 1.3833891153335571,
+      "learning_rate": 4.6310942074619787e-05,
+      "loss": 0.6681,
+      "step": 6603
+    },
+    {
+      "epoch": 0.09579506729047536,
+      "grad_norm": 1.6753300428390503,
+      "learning_rate": 4.626691348094777e-05,
+      "loss": 0.6658,
+      "step": 6634
+    },
+    {
+      "epoch": 0.09624270779183272,
+      "grad_norm": 1.951198697090149,
+      "learning_rate": 4.622264489304762e-05,
+      "loss": 0.6654,
+      "step": 6665
+    },
+    {
+      "epoch": 0.09669034829319009,
+      "grad_norm": 1.2356919050216675,
+      "learning_rate": 4.617813681048434e-05,
+      "loss": 0.6651,
+      "step": 6696
+    },
+    {
+      "epoch": 0.09713798879454745,
+      "grad_norm": 1.2712593078613281,
+      "learning_rate": 4.61333897355256e-05,
+      "loss": 0.6646,
+      "step": 6727
+    },
+    {
+      "epoch": 0.09758562929590481,
+      "grad_norm": 1.1935900449752808,
+      "learning_rate": 4.608840417313604e-05,
+      "loss": 0.674,
+      "step": 6758
+    },
+    {
+      "epoch": 0.09803326979726218,
+      "grad_norm": 1.1649430990219116,
+      "learning_rate": 4.6043180630971646e-05,
+      "loss": 0.6644,
+      "step": 6789
+    },
+    {
+      "epoch": 0.09848091029861954,
+      "grad_norm": 1.4281456470489502,
+      "learning_rate": 4.599771961937391e-05,
+      "loss": 0.6673,
+      "step": 6820
+    },
+    {
+      "epoch": 0.0989285507999769,
+      "grad_norm": 1.3064521551132202,
+      "learning_rate": 4.5952021651364204e-05,
+      "loss": 0.6584,
+      "step": 6851
+    },
+    {
+      "epoch": 0.09937619130133425,
+      "grad_norm": 1.2546554803848267,
+      "learning_rate": 4.590608724263786e-05,
+      "loss": 0.6612,
+      "step": 6882
+    },
+    {
+      "epoch": 0.09982383180269162,
+      "grad_norm": 1.1866974830627441,
+      "learning_rate": 4.585991691155845e-05,
+      "loss": 0.6612,
+      "step": 6913
+    },
+    {
+      "epoch": 0.10027147230404898,
+      "grad_norm": 1.6166640520095825,
+      "learning_rate": 4.581351117915188e-05,
+      "loss": 0.6551,
+      "step": 6944
+    },
+    {
+      "epoch": 0.10071911280540634,
+      "grad_norm": 1.5471700429916382,
+      "learning_rate": 4.5766870569100534e-05,
+      "loss": 0.6607,
+      "step": 6975
+    },
+    {
+      "epoch": 0.10116675330676371,
+      "grad_norm": 1.3361026048660278,
+      "learning_rate": 4.571999560773736e-05,
+      "loss": 0.666,
+      "step": 7006
+    },
+    {
+      "epoch": 0.10161439380812107,
+      "grad_norm": 1.2938140630722046,
+      "learning_rate": 4.5672886824039915e-05,
+      "loss": 0.6547,
+      "step": 7037
+    },
+    {
+      "epoch": 0.10206203430947842,
+      "grad_norm": 1.2688400745391846,
+      "learning_rate": 4.5625544749624435e-05,
+      "loss": 0.6624,
+      "step": 7068
+    },
+    {
+      "epoch": 0.10250967481083578,
+      "grad_norm": 1.6306285858154297,
+      "learning_rate": 4.5577969918739794e-05,
+      "loss": 0.6627,
+      "step": 7099
+    },
+    {
+      "epoch": 0.10295731531219315,
+      "grad_norm": 1.3346176147460938,
+      "learning_rate": 4.5530162868261486e-05,
+      "loss": 0.6577,
+      "step": 7130
+    },
+    {
+      "epoch": 0.10340495581355051,
+      "grad_norm": 1.0933984518051147,
+      "learning_rate": 4.548212413768558e-05,
+      "loss": 0.6602,
+      "step": 7161
+    },
+    {
+      "epoch": 0.10385259631490787,
+      "grad_norm": 1.575859785079956,
+      "learning_rate": 4.543385426912261e-05,
+      "loss": 0.6593,
+      "step": 7192
+    },
+    {
+      "epoch": 0.10430023681626524,
+      "grad_norm": 1.4265861511230469,
+      "learning_rate": 4.53853538072915e-05,
+      "loss": 0.6564,
+      "step": 7223
+    },
+    {
+      "epoch": 0.1047478773176226,
+      "grad_norm": 1.737012267112732,
+      "learning_rate": 4.533662329951336e-05,
+      "loss": 0.6593,
+      "step": 7254
+    },
+    {
+      "epoch": 0.10519551781897996,
+      "grad_norm": 1.0257115364074707,
+      "learning_rate": 4.528766329570536e-05,
+      "loss": 0.6514,
+      "step": 7285
+    },
+    {
+      "epoch": 0.10564315832033731,
+      "grad_norm": 1.5043773651123047,
+      "learning_rate": 4.523847434837447e-05,
+      "loss": 0.6635,
+      "step": 7316
+    },
+    {
+      "epoch": 0.10609079882169468,
+      "grad_norm": 1.5642234086990356,
+      "learning_rate": 4.518905701261128e-05,
+      "loss": 0.6558,
+      "step": 7347
+    },
+    {
+      "epoch": 0.10653843932305204,
+      "grad_norm": 1.1821067333221436,
+      "learning_rate": 4.5139411846083715e-05,
+      "loss": 0.6686,
+      "step": 7378
+    },
+    {
+      "epoch": 0.1069860798244094,
+      "grad_norm": 1.5492759943008423,
+      "learning_rate": 4.508953940903073e-05,
+      "loss": 0.6543,
+      "step": 7409
+    },
+    {
+      "epoch": 0.10743372032576677,
+      "grad_norm": 1.281914234161377,
+      "learning_rate": 4.5039440264255994e-05,
+      "loss": 0.6516,
+      "step": 7440
+    },
+    {
+      "epoch": 0.10788136082712413,
+      "grad_norm": 1.3318305015563965,
+      "learning_rate": 4.498911497712155e-05,
+      "loss": 0.656,
+      "step": 7471
+    },
+    {
+      "epoch": 0.10832900132848149,
+      "grad_norm": 1.3832449913024902,
+      "learning_rate": 4.493856411554142e-05,
+      "loss": 0.6475,
+      "step": 7502
+    },
+    {
+      "epoch": 0.10877664182983884,
+      "grad_norm": 1.3547158241271973,
+      "learning_rate": 4.4887788249975206e-05,
+      "loss": 0.6594,
+      "step": 7533
+    },
+    {
+      "epoch": 0.10922428233119622,
+      "grad_norm": 1.4633681774139404,
+      "learning_rate": 4.4836787953421656e-05,
+      "loss": 0.6707,
+      "step": 7564
+    },
+    {
+      "epoch": 0.10967192283255357,
+      "grad_norm": 1.1781059503555298,
+      "learning_rate": 4.478556380141218e-05,
+      "loss": 0.6626,
+      "step": 7595
+    },
+    {
+      "epoch": 0.11011956333391093,
+      "grad_norm": 1.4727883338928223,
+      "learning_rate": 4.4734116372004375e-05,
+      "loss": 0.6535,
+      "step": 7626
+    },
+    {
+      "epoch": 0.1105672038352683,
+      "grad_norm": 1.3888640403747559,
+      "learning_rate": 4.4682446245775477e-05,
+      "loss": 0.6606,
+      "step": 7657
+    },
+    {
+      "epoch": 0.11101484433662566,
+      "grad_norm": 1.308769941329956,
+      "learning_rate": 4.463055400581586e-05,
+      "loss": 0.6667,
+      "step": 7688
+    },
+    {
+      "epoch": 0.11146248483798302,
+      "grad_norm": 1.3579630851745605,
+      "learning_rate": 4.4578440237722374e-05,
+      "loss": 0.6621,
+      "step": 7719
+    },
+    {
+      "epoch": 0.11191012533934037,
+      "grad_norm": 1.1285645961761475,
+      "learning_rate": 4.452610552959183e-05,
+      "loss": 0.6597,
+      "step": 7750
+    },
+    {
+      "epoch": 0.11235776584069775,
+      "grad_norm": 1.1144675016403198,
+      "learning_rate": 4.447355047201428e-05,
+      "loss": 0.6638,
+      "step": 7781
+    },
+    {
+      "epoch": 0.1128054063420551,
+      "grad_norm": 1.1993658542633057,
+      "learning_rate": 4.4420775658066414e-05,
+      "loss": 0.6704,
+      "step": 7812
+    },
+    {
+      "epoch": 0.11325304684341246,
+      "grad_norm": 1.0608967542648315,
+      "learning_rate": 4.436778168330484e-05,
+      "loss": 0.6573,
+      "step": 7843
+    },
+    {
+      "epoch": 0.11370068734476983,
+      "grad_norm": 1.1210070848464966,
+      "learning_rate": 4.4314569145759353e-05,
+      "loss": 0.6612,
+      "step": 7874
+    },
+    {
+      "epoch": 0.11414832784612719,
+      "grad_norm": 1.2345409393310547,
+      "learning_rate": 4.42611386459262e-05,
+      "loss": 0.65,
+      "step": 7905
+    },
+    {
+      "epoch": 0.11459596834748455,
+      "grad_norm": 1.077025294303894,
+      "learning_rate": 4.420749078676133e-05,
+      "loss": 0.6595,
+      "step": 7936
+    },
+    {
+      "epoch": 0.1150436088488419,
+      "grad_norm": 1.2079277038574219,
+      "learning_rate": 4.4153626173673516e-05,
+      "loss": 0.6442,
+      "step": 7967
+    },
+    {
+      "epoch": 0.11549124935019928,
+      "grad_norm": 1.6710035800933838,
+      "learning_rate": 4.409954541451762e-05,
+      "loss": 0.663,
+      "step": 7998
+    },
+    {
+      "epoch": 0.11593888985155663,
+      "grad_norm": 1.3124401569366455,
+      "learning_rate": 4.404524911958764e-05,
+      "loss": 0.6512,
+      "step": 8029
+    },
+    {
+      "epoch": 0.11638653035291399,
+      "grad_norm": 1.644904375076294,
+      "learning_rate": 4.399073790160989e-05,
+      "loss": 0.6587,
+      "step": 8060
+    },
+    {
+      "epoch": 0.11683417085427136,
+      "grad_norm": 1.181624174118042,
+      "learning_rate": 4.393601237573607e-05,
+      "loss": 0.653,
+      "step": 8091
+    },
+    {
+      "epoch": 0.11728181135562872,
+      "grad_norm": 1.4587918519973755,
+      "learning_rate": 4.388107315953628e-05,
+      "loss": 0.675,
+      "step": 8122
+    },
+    {
+      "epoch": 0.11772945185698608,
+      "grad_norm": 1.2147635221481323,
+      "learning_rate": 4.382592087299212e-05,
+      "loss": 0.6521,
+      "step": 8153
+    },
+    {
+      "epoch": 0.11817709235834344,
+      "grad_norm": 1.0448981523513794,
+      "learning_rate": 4.377055613848964e-05,
+      "loss": 0.6541,
+      "step": 8184
+    },
+    {
+      "epoch": 0.11862473285970081,
+      "grad_norm": 1.4482290744781494,
+      "learning_rate": 4.3714979580812355e-05,
+      "loss": 0.6563,
+      "step": 8215
+    },
+    {
+      "epoch": 0.11907237336105816,
+      "grad_norm": 1.1621575355529785,
+      "learning_rate": 4.365919182713416e-05,
+      "loss": 0.656,
+      "step": 8246
+    },
+    {
+      "epoch": 0.11952001386241552,
+      "grad_norm": 1.1643873453140259,
+      "learning_rate": 4.360319350701226e-05,
+      "loss": 0.6547,
+      "step": 8277
+    },
+    {
+      "epoch": 0.1199676543637729,
+      "grad_norm": 1.4016129970550537,
+      "learning_rate": 4.3546985252380115e-05,
+      "loss": 0.6582,
+      "step": 8308
+    },
+    {
+      "epoch": 0.12041529486513025,
+      "grad_norm": 1.4023685455322266,
+      "learning_rate": 4.349056769754021e-05,
+      "loss": 0.6621,
+      "step": 8339
+    },
+    {
+      "epoch": 0.12086293536648761,
+      "grad_norm": 1.3020285367965698,
+      "learning_rate": 4.3433941479156994e-05,
+      "loss": 0.6674,
+      "step": 8370
+    },
+    {
+      "epoch": 0.12131057586784497,
+      "grad_norm": 1.2162435054779053,
+      "learning_rate": 4.3377107236249647e-05,
+      "loss": 0.6614,
+      "step": 8401
+    },
+    {
+      "epoch": 0.12175821636920234,
+      "grad_norm": 1.1956969499588013,
+      "learning_rate": 4.332006561018488e-05,
+      "loss": 0.6557,
+      "step": 8432
+    },
+    {
+      "epoch": 0.1222058568705597,
+      "grad_norm": 1.1723664999008179,
+      "learning_rate": 4.3262817244669683e-05,
+      "loss": 0.6633,
+      "step": 8463
+    },
+    {
+      "epoch": 0.12265349737191705,
+      "grad_norm": 1.113020658493042,
+      "learning_rate": 4.3205362785744083e-05,
+      "loss": 0.6577,
+      "step": 8494
+    },
+    {
+      "epoch": 0.12310113787327442,
+      "grad_norm": 1.2453004121780396,
+      "learning_rate": 4.314770288177384e-05,
+      "loss": 0.6544,
+      "step": 8525
+    },
+    {
+      "epoch": 0.12354877837463178,
+      "grad_norm": 1.1493890285491943,
+      "learning_rate": 4.308983818344313e-05,
+      "loss": 0.6533,
+      "step": 8556
+    },
+    {
+      "epoch": 0.12399641887598914,
+      "grad_norm": 1.4172496795654297,
+      "learning_rate": 4.3031769343747206e-05,
+      "loss": 0.6542,
+      "step": 8587
+    },
+    {
+      "epoch": 0.1244440593773465,
+      "grad_norm": 1.1840728521347046,
+      "learning_rate": 4.297349701798505e-05,
+      "loss": 0.6476,
+      "step": 8618
+    },
+    {
+      "epoch": 0.12489169987870387,
+      "grad_norm": 1.3720282316207886,
+      "learning_rate": 4.2915021863751916e-05,
+      "loss": 0.6446,
+      "step": 8649
+    },
+    {
+      "epoch": 0.12533934038006123,
+      "grad_norm": 1.1705596446990967,
+      "learning_rate": 4.285634454093198e-05,
+      "loss": 0.6537,
+      "step": 8680
+    },
+    {
+      "epoch": 0.1257869808814186,
+      "grad_norm": 1.0790083408355713,
+      "learning_rate": 4.279746571169086e-05,
+      "loss": 0.6543,
+      "step": 8711
+    },
+    {
+      "epoch": 0.12623462138277594,
+      "grad_norm": 1.1207470893859863,
+      "learning_rate": 4.2738386040468136e-05,
+      "loss": 0.6468,
+      "step": 8742
+    },
+    {
+      "epoch": 0.1266822618841333,
+      "grad_norm": 1.1123065948486328,
+      "learning_rate": 4.2679106193969866e-05,
+      "loss": 0.6596,
+      "step": 8773
+    },
+    {
+      "epoch": 0.12712990238549068,
+      "grad_norm": 1.1579636335372925,
+      "learning_rate": 4.261962684116106e-05,
+      "loss": 0.6458,
+      "step": 8804
+    },
+    {
+      "epoch": 0.12757754288684803,
+      "grad_norm": 1.3112802505493164,
+      "learning_rate": 4.2559948653258145e-05,
+      "loss": 0.6483,
+      "step": 8835
+    },
+    {
+      "epoch": 0.1280251833882054,
+      "grad_norm": 1.1104832887649536,
+      "learning_rate": 4.250007230372134e-05,
+      "loss": 0.645,
+      "step": 8866
+    },
+    {
+      "epoch": 0.12847282388956274,
+      "grad_norm": 1.0218713283538818,
+      "learning_rate": 4.2439998468247126e-05,
+      "loss": 0.6519,
+      "step": 8897
+    },
+    {
+      "epoch": 0.12892046439092011,
+      "grad_norm": 1.0053678750991821,
+      "learning_rate": 4.2379727824760566e-05,
+      "loss": 0.6468,
+      "step": 8928
+    },
+    {
+      "epoch": 0.12936810489227749,
+      "grad_norm": 1.410933017730713,
+      "learning_rate": 4.231926105340768e-05,
+      "loss": 0.6573,
+      "step": 8959
+    },
+    {
+      "epoch": 0.12981574539363483,
+      "grad_norm": 1.5001798868179321,
+      "learning_rate": 4.225859883654776e-05,
+      "loss": 0.6483,
+      "step": 8990
+    },
+    {
+      "epoch": 0.1302633858949922,
+      "grad_norm": 1.112316608428955,
+      "learning_rate": 4.219774185874569e-05,
+      "loss": 0.6483,
+      "step": 9021
+    },
+    {
+      "epoch": 0.13071102639634957,
+      "grad_norm": 1.527464747428894,
+      "learning_rate": 4.213669080676418e-05,
+      "loss": 0.6512,
+      "step": 9052
+    },
+    {
+      "epoch": 0.13115866689770692,
+      "grad_norm": 1.1075704097747803,
+      "learning_rate": 4.2075446369556056e-05,
+      "loss": 0.6577,
+      "step": 9083
+    },
+    {
+      "epoch": 0.1316063073990643,
+      "grad_norm": 0.9589399099349976,
+      "learning_rate": 4.201400923825648e-05,
+      "loss": 0.642,
+      "step": 9114
+    },
+    {
+      "epoch": 0.13205394790042166,
+      "grad_norm": 1.186531901359558,
+      "learning_rate": 4.195238010617511e-05,
+      "loss": 0.6553,
+      "step": 9145
+    },
+    {
+      "epoch": 0.132501588401779,
+      "grad_norm": 1.1176280975341797,
+      "learning_rate": 4.1890559668788344e-05,
+      "loss": 0.6483,
+      "step": 9176
+    },
+    {
+      "epoch": 0.13294922890313637,
+      "grad_norm": 1.4222681522369385,
+      "learning_rate": 4.1828548623731405e-05,
+      "loss": 0.6462,
+      "step": 9207
+    },
+    {
+      "epoch": 0.13339686940449375,
+      "grad_norm": 1.1606040000915527,
+      "learning_rate": 4.1766347670790506e-05,
+      "loss": 0.6514,
+      "step": 9238
+    },
+    {
+      "epoch": 0.1338445099058511,
+      "grad_norm": 1.313774585723877,
+      "learning_rate": 4.170395751189495e-05,
+      "loss": 0.6422,
+      "step": 9269
+    },
+    {
+      "epoch": 0.13429215040720846,
+      "grad_norm": 1.1994171142578125,
+      "learning_rate": 4.164137885110921e-05,
+      "loss": 0.6444,
+      "step": 9300
+    },
+    {
+      "epoch": 0.1347397909085658,
+      "grad_norm": 0.9376353025436401,
+      "learning_rate": 4.157861239462495e-05,
+      "loss": 0.6436,
+      "step": 9331
+    },
+    {
+      "epoch": 0.13518743140992318,
+      "grad_norm": 1.0350178480148315,
+      "learning_rate": 4.1515658850753114e-05,
+      "loss": 0.6447,
+      "step": 9362
+    },
+    {
+      "epoch": 0.13563507191128055,
+      "grad_norm": 1.3630082607269287,
+      "learning_rate": 4.145251892991588e-05,
+      "loss": 0.6427,
+      "step": 9393
+    },
+    {
+      "epoch": 0.1360827124126379,
+      "grad_norm": 1.0362364053726196,
+      "learning_rate": 4.138919334463868e-05,
+      "loss": 0.6443,
+      "step": 9424
+    },
+    {
+      "epoch": 0.13653035291399526,
+      "grad_norm": 1.1442211866378784,
+      "learning_rate": 4.1325682809542124e-05,
+      "loss": 0.6523,
+      "step": 9455
+    },
+    {
+      "epoch": 0.13697799341535263,
+      "grad_norm": 1.4196938276290894,
+      "learning_rate": 4.126198804133398e-05,
+      "loss": 0.6501,
+      "step": 9486
+    },
+    {
+      "epoch": 0.13742563391670998,
+      "grad_norm": 1.3853130340576172,
+      "learning_rate": 4.1198109758801055e-05,
+      "loss": 0.6431,
+      "step": 9517
+    },
+    {
+      "epoch": 0.13787327441806735,
+      "grad_norm": 1.0350273847579956,
+      "learning_rate": 4.113404868280107e-05,
+      "loss": 0.6436,
+      "step": 9548
+    },
+    {
+      "epoch": 0.13832091491942472,
+      "grad_norm": 1.0520857572555542,
+      "learning_rate": 4.106980553625457e-05,
+      "loss": 0.6436,
+      "step": 9579
+    },
+    {
+      "epoch": 0.13876855542078206,
+      "grad_norm": 1.127038836479187,
+      "learning_rate": 4.100538104413674e-05,
+      "loss": 0.639,
+      "step": 9610
+    },
+    {
+      "epoch": 0.13921619592213944,
+      "grad_norm": 1.1070880889892578,
+      "learning_rate": 4.09407759334692e-05,
+      "loss": 0.6366,
+      "step": 9641
+    },
+    {
+      "epoch": 0.1396638364234968,
+      "grad_norm": 1.3045605421066284,
+      "learning_rate": 4.087599093331186e-05,
+      "loss": 0.6496,
+      "step": 9672
+    },
+    {
+      "epoch": 0.14011147692485415,
+      "grad_norm": 1.234647512435913,
+      "learning_rate": 4.081102677475462e-05,
+      "loss": 0.6383,
+      "step": 9703
+    },
+    {
+      "epoch": 0.14055911742621152,
+      "grad_norm": 1.1154453754425049,
+      "learning_rate": 4.0745884190909194e-05,
+      "loss": 0.6454,
+      "step": 9734
+    },
+    {
+      "epoch": 0.14100675792756887,
+      "grad_norm": 1.2422186136245728,
+      "learning_rate": 4.0680563916900796e-05,
+      "loss": 0.6404,
+      "step": 9765
+    },
+    {
+      "epoch": 0.14145439842892624,
+      "grad_norm": 1.2128278017044067,
+      "learning_rate": 4.0615066689859815e-05,
+      "loss": 0.6376,
+      "step": 9796
+    },
+    {
+      "epoch": 0.1419020389302836,
+      "grad_norm": 1.3140804767608643,
+      "learning_rate": 4.0549393248913584e-05,
+      "loss": 0.6316,
+      "step": 9827
+    },
+    {
+      "epoch": 0.14234967943164095,
+      "grad_norm": 1.9198187589645386,
+      "learning_rate": 4.048354433517794e-05,
+      "loss": 0.6383,
+      "step": 9858
+    },
+    {
+      "epoch": 0.14279731993299832,
+      "grad_norm": 1.144679307937622,
+      "learning_rate": 4.0417520691748916e-05,
+      "loss": 0.6383,
+      "step": 9889
+    },
+    {
+      "epoch": 0.1432449604343557,
+      "grad_norm": 1.1679338216781616,
+      "learning_rate": 4.035132306369438e-05,
+      "loss": 0.6414,
+      "step": 9920
+    },
+    {
+      "epoch": 0.14369260093571304,
+      "grad_norm": 0.9563717246055603,
+      "learning_rate": 4.028495219804555e-05,
+      "loss": 0.6327,
+      "step": 9951
+    },
+    {
+      "epoch": 0.1441402414370704,
+      "grad_norm": 1.277036428451538,
+      "learning_rate": 4.021840884378864e-05,
+      "loss": 0.6365,
+      "step": 9982
+    },
+    {
+      "epoch": 0.14458788193842778,
+      "grad_norm": 0.9835182428359985,
+      "learning_rate": 4.015169375185633e-05,
+      "loss": 0.638,
+      "step": 10013
+    },
+    {
+      "epoch": 0.14503552243978513,
+      "grad_norm": 1.090118646621704,
+      "learning_rate": 4.0084807675119396e-05,
+      "loss": 0.6437,
+      "step": 10044
+    },
+    {
+      "epoch": 0.1454831629411425,
+      "grad_norm": 1.1823488473892212,
+      "learning_rate": 4.0017751368378106e-05,
+      "loss": 0.6326,
+      "step": 10075
+    },
+    {
+      "epoch": 0.14593080344249987,
+      "grad_norm": 1.070318341255188,
+      "learning_rate": 3.995052558835377e-05,
+      "loss": 0.6362,
+      "step": 10106
+    },
+    {
+      "epoch": 0.1463784439438572,
+      "grad_norm": 1.2451491355895996,
+      "learning_rate": 3.988313109368017e-05,
+      "loss": 0.6388,
+      "step": 10137
+    },
+    {
+      "epoch": 0.14682608444521458,
+      "grad_norm": 1.2417365312576294,
+      "learning_rate": 3.981556864489504e-05,
+      "loss": 0.6309,
+      "step": 10168
+    },
+    {
+      "epoch": 0.14727372494657193,
+      "grad_norm": 1.251518726348877,
+      "learning_rate": 3.974783900443142e-05,
+      "loss": 0.6365,
+      "step": 10199
+    },
+    {
+      "epoch": 0.1477213654479293,
+      "grad_norm": 1.359750747680664,
+      "learning_rate": 3.9679942936609095e-05,
+      "loss": 0.6386,
+      "step": 10230
+    },
+    {
+      "epoch": 0.14816900594928667,
+      "grad_norm": 1.1073262691497803,
+      "learning_rate": 3.961188120762596e-05,
+      "loss": 0.635,
+      "step": 10261
+    },
+    {
+      "epoch": 0.148616646450644,
+      "grad_norm": 0.9850608706474304,
+      "learning_rate": 3.954365458554938e-05,
+      "loss": 0.6389,
+      "step": 10292
+    },
+    {
+      "epoch": 0.14906428695200138,
+      "grad_norm": 1.2984429597854614,
+      "learning_rate": 3.947526384030751e-05,
+      "loss": 0.6317,
+      "step": 10323
+    },
+    {
+      "epoch": 0.14951192745335876,
+      "grad_norm": 1.1622575521469116,
+      "learning_rate": 3.9406709743680624e-05,
+      "loss": 0.6393,
+      "step": 10354
+    },
+    {
+      "epoch": 0.1499595679547161,
+      "grad_norm": 1.0856871604919434,
+      "learning_rate": 3.9337993069292366e-05,
+      "loss": 0.6351,
+      "step": 10385
+    },
+    {
+      "epoch": 0.15040720845607347,
+      "grad_norm": 1.0153882503509521,
+      "learning_rate": 3.926911459260109e-05,
+      "loss": 0.6282,
+      "step": 10416
+    },
+    {
+      "epoch": 0.15085484895743084,
+      "grad_norm": 1.2039254903793335,
+      "learning_rate": 3.920007509089102e-05,
+      "loss": 0.6365,
+      "step": 10447
+    },
+    {
+      "epoch": 0.1513024894587882,
+      "grad_norm": 1.1179555654525757,
+      "learning_rate": 3.913087534326357e-05,
+      "loss": 0.6311,
+      "step": 10478
+    },
+    {
+      "epoch": 0.15175012996014556,
+      "grad_norm": 1.090903639793396,
+      "learning_rate": 3.9061516130628475e-05,
+      "loss": 0.6401,
+      "step": 10509
+    },
+    {
+      "epoch": 0.15219777046150293,
+      "grad_norm": 0.9228240251541138,
+      "learning_rate": 3.8991998235695025e-05,
+      "loss": 0.6323,
+      "step": 10540
+    },
+    {
+      "epoch": 0.15264541096286027,
+      "grad_norm": 1.0772743225097656,
+      "learning_rate": 3.8922322442963224e-05,
+      "loss": 0.637,
+      "step": 10571
+    },
+    {
+      "epoch": 0.15309305146421764,
+      "grad_norm": 1.0854771137237549,
+      "learning_rate": 3.885248953871491e-05,
+      "loss": 0.6375,
+      "step": 10602
+    },
+    {
+      "epoch": 0.153540691965575,
+      "grad_norm": 1.3902987241744995,
+      "learning_rate": 3.8782500311004915e-05,
+      "loss": 0.6406,
+      "step": 10633
+    },
+    {
+      "epoch": 0.15398833246693236,
+      "grad_norm": 1.180351734161377,
+      "learning_rate": 3.871235554965218e-05,
+      "loss": 0.626,
+      "step": 10664
+    },
+    {
+      "epoch": 0.15443597296828973,
+      "grad_norm": 1.1136449575424194,
+      "learning_rate": 3.864205604623078e-05,
+      "loss": 0.6395,
+      "step": 10695
+    },
+    {
+      "epoch": 0.15488361346964707,
+      "grad_norm": 1.1770708560943604,
+      "learning_rate": 3.857160259406107e-05,
+      "loss": 0.633,
+      "step": 10726
+    },
+    {
+      "epoch": 0.15533125397100445,
+      "grad_norm": 1.1615066528320312,
+      "learning_rate": 3.8500995988200674e-05,
+      "loss": 0.632,
+      "step": 10757
+    },
+    {
+      "epoch": 0.15577889447236182,
+      "grad_norm": 1.2898380756378174,
+      "learning_rate": 3.843023702543556e-05,
+      "loss": 0.6332,
+      "step": 10788
+    },
+    {
+      "epoch": 0.15622653497371916,
+      "grad_norm": 1.0051672458648682,
+      "learning_rate": 3.8359326504270984e-05,
+      "loss": 0.6353,
+      "step": 10819
+    },
+    {
+      "epoch": 0.15667417547507653,
+      "grad_norm": 0.9514272212982178,
+      "learning_rate": 3.828826522492255e-05,
+      "loss": 0.6383,
+      "step": 10850
+    },
+    {
+      "epoch": 0.1571218159764339,
+      "grad_norm": 1.2570873498916626,
+      "learning_rate": 3.821705398930713e-05,
+      "loss": 0.6308,
+      "step": 10881
+    },
+    {
+      "epoch": 0.15756945647779125,
+      "grad_norm": 0.992323637008667,
+      "learning_rate": 3.814569360103385e-05,
+      "loss": 0.6303,
+      "step": 10912
+    },
+    {
+      "epoch": 0.15801709697914862,
+      "grad_norm": 1.255265474319458,
+      "learning_rate": 3.807418486539499e-05,
+      "loss": 0.6349,
+      "step": 10943
+    },
+    {
+      "epoch": 0.158464737480506,
+      "grad_norm": 1.1066702604293823,
+      "learning_rate": 3.80025285893569e-05,
+      "loss": 0.6317,
+      "step": 10974
+    },
+    {
+      "epoch": 0.15891237798186333,
+      "grad_norm": 1.178690791130066,
+      "learning_rate": 3.793072558155093e-05,
+      "loss": 0.639,
+      "step": 11005
+    },
+    {
+      "epoch": 0.1593600184832207,
+      "grad_norm": 1.0850341320037842,
+      "learning_rate": 3.785877665226426e-05,
+      "loss": 0.6375,
+      "step": 11036
+    },
+    {
+      "epoch": 0.15980765898457805,
+      "grad_norm": 1.1378651857376099,
+      "learning_rate": 3.778668261343079e-05,
+      "loss": 0.6287,
+      "step": 11067
+    },
+    {
+      "epoch": 0.16025529948593542,
+      "grad_norm": 1.07688570022583,
+      "learning_rate": 3.771444427862192e-05,
+      "loss": 0.6261,
+      "step": 11098
+    },
+    {
+      "epoch": 0.1607029399872928,
+      "grad_norm": 1.108269453048706,
+      "learning_rate": 3.7642062463037465e-05,
+      "loss": 0.6352,
+      "step": 11129
+    },
+    {
+      "epoch": 0.16115058048865014,
+      "grad_norm": 1.2582095861434937,
+      "learning_rate": 3.7569537983496373e-05,
+      "loss": 0.6312,
+      "step": 11160
+    },
+    {
+      "epoch": 0.1615982209900075,
+      "grad_norm": 0.9823578000068665,
+      "learning_rate": 3.749687165842753e-05,
+      "loss": 0.6253,
+      "step": 11191
+    },
+    {
+      "epoch": 0.16204586149136488,
+      "grad_norm": 1.3922805786132812,
+      "learning_rate": 3.7424064307860536e-05,
+      "loss": 0.6279,
+      "step": 11222
+    },
+    {
+      "epoch": 0.16249350199272222,
+      "grad_norm": 1.2210962772369385,
+      "learning_rate": 3.735111675341645e-05,
+      "loss": 0.6357,
+      "step": 11253
+    },
+    {
+      "epoch": 0.1629411424940796,
+      "grad_norm": 1.0463316440582275,
+      "learning_rate": 3.7278029818298524e-05,
+      "loss": 0.6332,
+      "step": 11284
+    },
+    {
+      "epoch": 0.16338878299543697,
+      "grad_norm": 1.165583848953247,
+      "learning_rate": 3.720480432728287e-05,
+      "loss": 0.627,
+      "step": 11315
+    },
+    {
+      "epoch": 0.1638364234967943,
+      "grad_norm": 1.0995306968688965,
+      "learning_rate": 3.71314411067092e-05,
+      "loss": 0.6283,
+      "step": 11346
+    },
+    {
+      "epoch": 0.16428406399815168,
+      "grad_norm": 1.0279158353805542,
+      "learning_rate": 3.70579409844715e-05,
+      "loss": 0.6287,
+      "step": 11377
+    },
+    {
+      "epoch": 0.16473170449950905,
+      "grad_norm": 1.51092529296875,
+      "learning_rate": 3.698430479000865e-05,
+      "loss": 0.6261,
+      "step": 11408
+    },
+    {
+      "epoch": 0.1651793450008664,
+      "grad_norm": 1.020936369895935,
+      "learning_rate": 3.691053335429509e-05,
+      "loss": 0.6327,
+      "step": 11439
+    },
+    {
+      "epoch": 0.16562698550222377,
+      "grad_norm": 1.0198683738708496,
+      "learning_rate": 3.683662750983147e-05,
+      "loss": 0.6422,
+      "step": 11470
+    },
+    {
+      "epoch": 0.1660746260035811,
+      "grad_norm": 1.2650995254516602,
+      "learning_rate": 3.676258809063518e-05,
+      "loss": 0.6354,
+      "step": 11501
+    },
+    {
+      "epoch": 0.16652226650493848,
+      "grad_norm": 1.1653568744659424,
+      "learning_rate": 3.6688415932231004e-05,
+      "loss": 0.6325,
+      "step": 11532
+    },
+    {
+      "epoch": 0.16696990700629585,
+      "grad_norm": 1.1461430788040161,
+      "learning_rate": 3.661411187164166e-05,
+      "loss": 0.6251,
+      "step": 11563
+    },
+    {
+      "epoch": 0.1674175475076532,
+      "grad_norm": 1.2535974979400635,
+      "learning_rate": 3.65396767473784e-05,
+      "loss": 0.6245,
+      "step": 11594
+    },
+    {
+      "epoch": 0.16786518800901057,
+      "grad_norm": 1.115191102027893,
+      "learning_rate": 3.6465111399431465e-05,
+      "loss": 0.6294,
+      "step": 11625
+    },
+    {
+      "epoch": 0.16831282851036794,
+      "grad_norm": 1.0482964515686035,
+      "learning_rate": 3.6390416669260674e-05,
+      "loss": 0.6247,
+      "step": 11656
+    },
+    {
+      "epoch": 0.16876046901172528,
+      "grad_norm": 1.1431951522827148,
+      "learning_rate": 3.63155933997859e-05,
+      "loss": 0.63,
+      "step": 11687
+    },
+    {
+      "epoch": 0.16920810951308266,
+      "grad_norm": 1.0254175662994385,
+      "learning_rate": 3.624064243537758e-05,
+      "loss": 0.6212,
+      "step": 11718
+    },
+    {
+      "epoch": 0.16965575001444003,
+      "grad_norm": 0.9481080174446106,
+      "learning_rate": 3.616556462184716e-05,
+      "loss": 0.6278,
+      "step": 11749
+    },
+    {
+      "epoch": 0.17010339051579737,
+      "grad_norm": 1.118394374847412,
+      "learning_rate": 3.609036080643755e-05,
+      "loss": 0.6244,
+      "step": 11780
+    },
+    {
+      "epoch": 0.17055103101715474,
+      "grad_norm": 1.1592167615890503,
+      "learning_rate": 3.60150318378136e-05,
+      "loss": 0.621,
+      "step": 11811
+    },
+    {
+      "epoch": 0.1709986715185121,
+      "grad_norm": 0.9984686374664307,
+      "learning_rate": 3.5939578566052465e-05,
+      "loss": 0.6319,
+      "step": 11842
+    },
+    {
+      "epoch": 0.17144631201986946,
+      "grad_norm": 1.0091164112091064,
+      "learning_rate": 3.586400184263408e-05,
+      "loss": 0.6345,
+      "step": 11873
+    },
+    {
+      "epoch": 0.17189395252122683,
+      "grad_norm": 1.0355888605117798,
+      "learning_rate": 3.578830252043148e-05,
+      "loss": 0.6171,
+      "step": 11904
+    },
+    {
+      "epoch": 0.17234159302258417,
+      "grad_norm": 1.1437592506408691,
+      "learning_rate": 3.571248145370125e-05,
+      "loss": 0.6201,
+      "step": 11935
+    },
+    {
+      "epoch": 0.17278923352394154,
+      "grad_norm": 0.9440962672233582,
+      "learning_rate": 3.5636539498073794e-05,
+      "loss": 0.6236,
+      "step": 11966
+    },
+    {
+      "epoch": 0.17323687402529891,
+      "grad_norm": 0.9761082530021667,
+      "learning_rate": 3.556047751054378e-05,
+      "loss": 0.6291,
+      "step": 11997
+    },
+    {
+      "epoch": 0.17368451452665626,
+      "grad_norm": 1.1858127117156982,
+      "learning_rate": 3.548429634946039e-05,
+      "loss": 0.6299,
+      "step": 12028
+    },
+    {
+      "epoch": 0.17413215502801363,
+      "grad_norm": 1.0180195569992065,
+      "learning_rate": 3.540799687451768e-05,
+      "loss": 0.6227,
+      "step": 12059
+    },
+    {
+      "epoch": 0.174579795529371,
+      "grad_norm": 0.9683852195739746,
+      "learning_rate": 3.533157994674485e-05,
+      "loss": 0.626,
+      "step": 12090
+    },
+    {
+      "epoch": 0.17502743603072834,
+      "grad_norm": 1.0338289737701416,
+      "learning_rate": 3.5255046428496546e-05,
+      "loss": 0.6377,
+      "step": 12121
+    },
+    {
+      "epoch": 0.17547507653208572,
+      "grad_norm": 1.1238298416137695,
+      "learning_rate": 3.517839718344311e-05,
+      "loss": 0.6338,
+      "step": 12152
+    },
+    {
+      "epoch": 0.1759227170334431,
+      "grad_norm": 1.0541973114013672,
+      "learning_rate": 3.510163307656086e-05,
+      "loss": 0.6222,
+      "step": 12183
+    },
+    {
+      "epoch": 0.17637035753480043,
+      "grad_norm": 1.1677592992782593,
+      "learning_rate": 3.5024754974122324e-05,
+      "loss": 0.6256,
+      "step": 12214
+    },
+    {
+      "epoch": 0.1768179980361578,
+      "grad_norm": 1.0191985368728638,
+      "learning_rate": 3.494776374368643e-05,
+      "loss": 0.6301,
+      "step": 12245
+    },
+    {
+      "epoch": 0.17726563853751517,
+      "grad_norm": 1.1535918712615967,
+      "learning_rate": 3.4870660254088724e-05,
+      "loss": 0.6253,
+      "step": 12276
+    },
+    {
+      "epoch": 0.17771327903887252,
+      "grad_norm": 1.0887985229492188,
+      "learning_rate": 3.479344537543164e-05,
+      "loss": 0.6335,
+      "step": 12307
+    },
+    {
+      "epoch": 0.1781609195402299,
+      "grad_norm": 1.010688066482544,
+      "learning_rate": 3.4716119979074565e-05,
+      "loss": 0.6251,
+      "step": 12338
+    },
+    {
+      "epoch": 0.17860856004158723,
+      "grad_norm": 0.9745127558708191,
+      "learning_rate": 3.463868493762412e-05,
+      "loss": 0.6241,
+      "step": 12369
+    },
+    {
+      "epoch": 0.1790562005429446,
+      "grad_norm": 1.0414716005325317,
+      "learning_rate": 3.456114112492418e-05,
+      "loss": 0.6237,
+      "step": 12400
+    },
+    {
+      "epoch": 0.17950384104430198,
+      "grad_norm": 1.0457465648651123,
+      "learning_rate": 3.4483489416046164e-05,
+      "loss": 0.6258,
+      "step": 12431
+    },
+    {
+      "epoch": 0.17995148154565932,
+      "grad_norm": 1.0389049053192139,
+      "learning_rate": 3.440573068727905e-05,
+      "loss": 0.6262,
+      "step": 12462
+    },
+    {
+      "epoch": 0.1803991220470167,
+      "grad_norm": 1.255600094795227,
+      "learning_rate": 3.4327865816119495e-05,
+      "loss": 0.6305,
+      "step": 12493
+    },
+    {
+      "epoch": 0.18084676254837406,
+      "grad_norm": 1.0340358018875122,
+      "learning_rate": 3.4249895681262025e-05,
+      "loss": 0.6212,
+      "step": 12524
+    },
+    {
+      "epoch": 0.1812944030497314,
+      "grad_norm": 1.0317034721374512,
+      "learning_rate": 3.417182116258899e-05,
+      "loss": 0.6279,
+      "step": 12555
+    },
+    {
+      "epoch": 0.18174204355108878,
+      "grad_norm": 1.1320221424102783,
+      "learning_rate": 3.409364314116074e-05,
+      "loss": 0.631,
+      "step": 12586
+    },
+    {
+      "epoch": 0.18218968405244615,
+      "grad_norm": 0.9674787521362305,
+      "learning_rate": 3.401536249920559e-05,
+      "loss": 0.627,
+      "step": 12617
+    },
+    {
+      "epoch": 0.1826373245538035,
+      "grad_norm": 0.9329623579978943,
+      "learning_rate": 3.393698012010998e-05,
+      "loss": 0.6244,
+      "step": 12648
+    },
+    {
+      "epoch": 0.18308496505516086,
+      "grad_norm": 1.2081501483917236,
+      "learning_rate": 3.385849688840839e-05,
+      "loss": 0.6295,
+      "step": 12679
+    },
+    {
+      "epoch": 0.18353260555651824,
+      "grad_norm": 0.9842090010643005,
+      "learning_rate": 3.3779913689773414e-05,
+      "loss": 0.6276,
+      "step": 12710
+    },
+    {
+      "epoch": 0.18398024605787558,
+      "grad_norm": 1.1417752504348755,
+      "learning_rate": 3.370123141100578e-05,
+      "loss": 0.6266,
+      "step": 12741
+    },
+    {
+      "epoch": 0.18442788655923295,
+      "grad_norm": 0.9693592190742493,
+      "learning_rate": 3.3622450940024305e-05,
+      "loss": 0.6245,
+      "step": 12772
+    },
+    {
+      "epoch": 0.1848755270605903,
+      "grad_norm": 1.1920111179351807,
+      "learning_rate": 3.35435731658559e-05,
+      "loss": 0.6227,
+      "step": 12803
+    },
+    {
+      "epoch": 0.18532316756194767,
+      "grad_norm": 0.9865401387214661,
+      "learning_rate": 3.346459897862552e-05,
+      "loss": 0.6287,
+      "step": 12834
+    },
+    {
+      "epoch": 0.18577080806330504,
+      "grad_norm": 0.9544184803962708,
+      "learning_rate": 3.338552926954613e-05,
+      "loss": 0.6236,
+      "step": 12865
+    },
+    {
+      "epoch": 0.18621844856466238,
+      "grad_norm": 1.0202548503875732,
+      "learning_rate": 3.330636493090868e-05,
+      "loss": 0.6269,
+      "step": 12896
+    },
+    {
+      "epoch": 0.18666608906601975,
+      "grad_norm": 1.1385433673858643,
+      "learning_rate": 3.322710685607193e-05,
+      "loss": 0.6385,
+      "step": 12927
+    },
+    {
+      "epoch": 0.18711372956737712,
+      "grad_norm": 1.0102901458740234,
+      "learning_rate": 3.314775593945251e-05,
+      "loss": 0.6241,
+      "step": 12958
+    },
+    {
+      "epoch": 0.18756137006873447,
+      "grad_norm": 0.9830989241600037,
+      "learning_rate": 3.3068313076514714e-05,
+      "loss": 0.6243,
+      "step": 12989
+    },
+    {
+      "epoch": 0.18800901057009184,
+      "grad_norm": 1.0044376850128174,
+      "learning_rate": 3.298877916376047e-05,
+      "loss": 0.619,
+      "step": 13020
+    },
+    {
+      "epoch": 0.1884566510714492,
+      "grad_norm": 1.0714712142944336,
+      "learning_rate": 3.290915509871915e-05,
+      "loss": 0.6243,
+      "step": 13051
+    },
+    {
+      "epoch": 0.18890429157280655,
+      "grad_norm": 0.9379229545593262,
+      "learning_rate": 3.282944177993753e-05,
+      "loss": 0.6216,
+      "step": 13082
+    },
+    {
+      "epoch": 0.18935193207416393,
+      "grad_norm": 1.2717514038085938,
+      "learning_rate": 3.274964010696957e-05,
+      "loss": 0.6206,
+      "step": 13113
+    },
+    {
+      "epoch": 0.1897995725755213,
+      "grad_norm": 1.1147576570510864,
+      "learning_rate": 3.266975098036629e-05,
+      "loss": 0.6234,
+      "step": 13144
+    },
+    {
+      "epoch": 0.19024721307687864,
+      "grad_norm": 0.9994730949401855,
+      "learning_rate": 3.258977530166562e-05,
+      "loss": 0.6146,
+      "step": 13175
+    },
+    {
+      "epoch": 0.190694853578236,
+      "grad_norm": 1.195367693901062,
+      "learning_rate": 3.250971397338227e-05,
+      "loss": 0.624,
+      "step": 13206
+    },
+    {
+      "epoch": 0.19114249407959336,
+      "grad_norm": 1.0008747577667236,
+      "learning_rate": 3.2429567898997404e-05,
+      "loss": 0.6182,
+      "step": 13237
+    },
+    {
+      "epoch": 0.19159013458095073,
+      "grad_norm": 1.3223299980163574,
+      "learning_rate": 3.234933798294859e-05,
+      "loss": 0.6193,
+      "step": 13268
+    },
+    {
+      "epoch": 0.1920377750823081,
+      "grad_norm": 1.1946437358856201,
+      "learning_rate": 3.2269025130619535e-05,
+      "loss": 0.6201,
+      "step": 13299
+    },
+    {
+      "epoch": 0.19248541558366544,
+      "grad_norm": 1.1597986221313477,
+      "learning_rate": 3.218863024832985e-05,
+      "loss": 0.6212,
+      "step": 13330
+    },
+    {
+      "epoch": 0.1929330560850228,
+      "grad_norm": 0.9518936276435852,
+      "learning_rate": 3.2108154243324864e-05,
+      "loss": 0.6154,
+      "step": 13361
+    },
+    {
+      "epoch": 0.19338069658638019,
+      "grad_norm": 0.890487790107727,
+      "learning_rate": 3.2027598023765345e-05,
+      "loss": 0.6203,
+      "step": 13392
+    },
+    {
+      "epoch": 0.19382833708773753,
+      "grad_norm": 0.9918534755706787,
+      "learning_rate": 3.194696249871729e-05,
+      "loss": 0.6319,
+      "step": 13423
+    },
+    {
+      "epoch": 0.1942759775890949,
+      "grad_norm": 1.1954073905944824,
+      "learning_rate": 3.186624857814164e-05,
+      "loss": 0.619,
+      "step": 13454
+    },
+    {
+      "epoch": 0.19472361809045227,
+      "grad_norm": 1.1521157026290894,
+      "learning_rate": 3.178545717288401e-05,
+      "loss": 0.6326,
+      "step": 13485
+    },
+    {
+      "epoch": 0.19517125859180962,
+      "grad_norm": 1.0131208896636963,
+      "learning_rate": 3.170458919466444e-05,
+      "loss": 0.6234,
+      "step": 13516
+    },
+    {
+      "epoch": 0.195618899093167,
+      "grad_norm": 1.0429494380950928,
+      "learning_rate": 3.1623645556067063e-05,
+      "loss": 0.6146,
+      "step": 13547
+    },
+    {
+      "epoch": 0.19606653959452436,
+      "grad_norm": 0.9586461782455444,
+      "learning_rate": 3.154262717052985e-05,
+      "loss": 0.6192,
+      "step": 13578
+    },
+    {
+      "epoch": 0.1965141800958817,
+      "grad_norm": 0.9385515451431274,
+      "learning_rate": 3.146153495233426e-05,
+      "loss": 0.6186,
+      "step": 13609
+    },
+    {
+      "epoch": 0.19696182059723907,
+      "grad_norm": 0.9109722375869751,
+      "learning_rate": 3.1380369816594944e-05,
+      "loss": 0.6223,
+      "step": 13640
+    },
+    {
+      "epoch": 0.19740946109859642,
+      "grad_norm": 1.0564444065093994,
+      "learning_rate": 3.129913267924946e-05,
+      "loss": 0.6235,
+      "step": 13671
+    },
+    {
+      "epoch": 0.1978571015999538,
+      "grad_norm": 1.1656286716461182,
+      "learning_rate": 3.121782445704782e-05,
+      "loss": 0.6176,
+      "step": 13702
+    },
+    {
+      "epoch": 0.19830474210131116,
+      "grad_norm": 1.1301069259643555,
+      "learning_rate": 3.11364460675423e-05,
+      "loss": 0.6253,
+      "step": 13733
+    },
+    {
+      "epoch": 0.1987523826026685,
+      "grad_norm": 0.9939395785331726,
+      "learning_rate": 3.1054998429076934e-05,
+      "loss": 0.6223,
+      "step": 13764
+    },
+    {
+      "epoch": 0.19920002310402588,
+      "grad_norm": 1.2881885766983032,
+      "learning_rate": 3.097348246077728e-05,
+      "loss": 0.6177,
+      "step": 13795
+    },
+    {
+      "epoch": 0.19964766360538325,
+      "grad_norm": 1.1002579927444458,
+      "learning_rate": 3.0891899082539924e-05,
+      "loss": 0.6139,
+      "step": 13826
+    },
+    {
+      "epoch": 0.2000953041067406,
+      "grad_norm": 1.045394778251648,
+      "learning_rate": 3.0810249215022233e-05,
+      "loss": 0.6192,
+      "step": 13857
+    },
+    {
+      "epoch": 0.20054294460809796,
+      "grad_norm": 0.9559116959571838,
+      "learning_rate": 3.0728533779631865e-05,
+      "loss": 0.6155,
+      "step": 13888
+    },
+    {
+      "epoch": 0.20099058510945533,
+      "grad_norm": 0.9250887036323547,
+      "learning_rate": 3.064675369851637e-05,
+      "loss": 0.6235,
+      "step": 13919
+    },
+    {
+      "epoch": 0.20143822561081268,
+      "grad_norm": 1.0655368566513062,
+      "learning_rate": 3.056490989455289e-05,
+      "loss": 0.628,
+      "step": 13950
+    },
+    {
+      "epoch": 0.20188586611217005,
+      "grad_norm": 1.07636559009552,
+      "learning_rate": 3.0483003291337596e-05,
+      "loss": 0.6244,
+      "step": 13981
+    },
+    {
+      "epoch": 0.20233350661352742,
+      "grad_norm": 1.050580620765686,
+      "learning_rate": 3.040103481317539e-05,
+      "loss": 0.6222,
+      "step": 14012
+    },
+    {
+      "epoch": 0.20278114711488476,
+      "grad_norm": 1.3754404783248901,
+      "learning_rate": 3.03190053850694e-05,
+      "loss": 0.6151,
+      "step": 14043
+    },
+    {
+      "epoch": 0.20322878761624213,
+      "grad_norm": 1.0527547597885132,
+      "learning_rate": 3.0236915932710573e-05,
+      "loss": 0.6153,
+      "step": 14074
+    },
+    {
+      "epoch": 0.20367642811759948,
+      "grad_norm": 0.9438226819038391,
+      "learning_rate": 3.0154767382467232e-05,
+      "loss": 0.618,
+      "step": 14105
+    },
+    {
+      "epoch": 0.20412406861895685,
+      "grad_norm": 1.0383126735687256,
+      "learning_rate": 3.0072560661374582e-05,
+      "loss": 0.6162,
+      "step": 14136
+    },
+    {
+      "epoch": 0.20457170912031422,
+      "grad_norm": 1.1412239074707031,
+      "learning_rate": 2.999029669712431e-05,
+      "loss": 0.6284,
+      "step": 14167
+    },
+    {
+      "epoch": 0.20501934962167156,
+      "grad_norm": 1.1064159870147705,
+      "learning_rate": 2.990797641805408e-05,
+      "loss": 0.6223,
+      "step": 14198
+    },
+    {
+      "epoch": 0.20546699012302894,
+      "grad_norm": 1.0044069290161133,
+      "learning_rate": 2.982560075313704e-05,
+      "loss": 0.6191,
+      "step": 14229
+    },
+    {
+      "epoch": 0.2059146306243863,
+      "grad_norm": 0.9315604567527771,
+      "learning_rate": 2.9743170631971368e-05,
+      "loss": 0.6207,
+      "step": 14260
+    },
+    {
+      "epoch": 0.20636227112574365,
+      "grad_norm": 0.941224217414856,
+      "learning_rate": 2.9660686984769792e-05,
+      "loss": 0.6207,
+      "step": 14291
+    },
+    {
+      "epoch": 0.20680991162710102,
+      "grad_norm": 1.1239089965820312,
+      "learning_rate": 2.9578150742349047e-05,
+      "loss": 0.6252,
+      "step": 14322
+    },
+    {
+      "epoch": 0.2072575521284584,
+      "grad_norm": 0.9484926462173462,
+      "learning_rate": 2.949556283611942e-05,
+      "loss": 0.6136,
+      "step": 14353
+    },
+    {
+      "epoch": 0.20770519262981574,
+      "grad_norm": 0.9437084197998047,
+      "learning_rate": 2.9412924198074206e-05,
+      "loss": 0.6154,
+      "step": 14384
+    },
+    {
+      "epoch": 0.2081528331311731,
+      "grad_norm": 0.9578093886375427,
+      "learning_rate": 2.9330235760779208e-05,
+      "loss": 0.6191,
+      "step": 14415
+    },
+    {
+      "epoch": 0.20860047363253048,
+      "grad_norm": 1.0657248497009277,
+      "learning_rate": 2.9247498457362188e-05,
+      "loss": 0.6178,
+      "step": 14446
+    },
+    {
+      "epoch": 0.20904811413388782,
+      "grad_norm": 0.853568434715271,
+      "learning_rate": 2.9164713221502373e-05,
+      "loss": 0.6152,
+      "step": 14477
+    },
+    {
+      "epoch": 0.2094957546352452,
+      "grad_norm": 1.0403015613555908,
+      "learning_rate": 2.9081880987419912e-05,
+      "loss": 0.6108,
+      "step": 14508
+    },
+    {
+      "epoch": 0.20994339513660254,
+      "grad_norm": 1.0344171524047852,
+      "learning_rate": 2.8999002689865296e-05,
+      "loss": 0.6155,
+      "step": 14539
+    },
+    {
+      "epoch": 0.2103910356379599,
+      "grad_norm": 1.0755060911178589,
+      "learning_rate": 2.8916079264108852e-05,
+      "loss": 0.6156,
+      "step": 14570
+    },
+    {
+      "epoch": 0.21083867613931728,
+      "grad_norm": 0.8636776208877563,
+      "learning_rate": 2.883311164593017e-05,
+      "loss": 0.6193,
+      "step": 14601
+    },
+    {
+      "epoch": 0.21128631664067463,
+      "grad_norm": 1.0264644622802734,
+      "learning_rate": 2.875010077160754e-05,
+      "loss": 0.6138,
+      "step": 14632
+    },
+    {
+      "epoch": 0.211733957142032,
+      "grad_norm": 1.2590196132659912,
+      "learning_rate": 2.866704757790741e-05,
+      "loss": 0.6202,
+      "step": 14663
+    },
+    {
+      "epoch": 0.21218159764338937,
+      "grad_norm": 1.1028645038604736,
+      "learning_rate": 2.858395300207376e-05,
+      "loss": 0.614,
+      "step": 14694
+    },
+    {
+      "epoch": 0.2126292381447467,
+      "grad_norm": 0.8904405236244202,
+      "learning_rate": 2.8500817981817607e-05,
+      "loss": 0.6152,
+      "step": 14725
+    },
+    {
+      "epoch": 0.21307687864610408,
+      "grad_norm": 0.9810163974761963,
+      "learning_rate": 2.8417643455306336e-05,
+      "loss": 0.6088,
+      "step": 14756
+    },
+    {
+      "epoch": 0.21352451914746146,
+      "grad_norm": 0.9837898015975952,
+      "learning_rate": 2.8334430361153185e-05,
+      "loss": 0.6129,
+      "step": 14787
+    },
+    {
+      "epoch": 0.2139721596488188,
+      "grad_norm": 0.987639844417572,
+      "learning_rate": 2.8251179638406612e-05,
+      "loss": 0.6081,
+      "step": 14818
+    },
+    {
+      "epoch": 0.21441980015017617,
+      "grad_norm": 1.1478586196899414,
+      "learning_rate": 2.8167892226539704e-05,
+      "loss": 0.6146,
+      "step": 14849
+    },
+    {
+      "epoch": 0.21486744065153354,
+      "grad_norm": 1.0885242223739624,
+      "learning_rate": 2.8084569065439588e-05,
+      "loss": 0.6183,
+      "step": 14880
+    },
+    {
+      "epoch": 0.21531508115289089,
+      "grad_norm": 0.9934699535369873,
+      "learning_rate": 2.8001211095396807e-05,
+      "loss": 0.6157,
+      "step": 14911
+    },
+    {
+      "epoch": 0.21576272165424826,
+      "grad_norm": 0.9285492300987244,
+      "learning_rate": 2.791781925709473e-05,
+      "loss": 0.6196,
+      "step": 14942
+    },
+    {
+      "epoch": 0.2162103621556056,
+      "grad_norm": 1.243133783340454,
+      "learning_rate": 2.7834394491598908e-05,
+      "loss": 0.6109,
+      "step": 14973
+    },
+    {
+      "epoch": 0.21665800265696297,
+      "grad_norm": 1.0712559223175049,
+      "learning_rate": 2.7750937740346485e-05,
+      "loss": 0.6268,
+      "step": 15004
+    },
+    {
+      "epoch": 0.21710564315832034,
+      "grad_norm": 1.0762903690338135,
+      "learning_rate": 2.7667449945135564e-05,
+      "loss": 0.6162,
+      "step": 15035
+    },
+    {
+      "epoch": 0.2175532836596777,
+      "grad_norm": 1.043479084968567,
+      "learning_rate": 2.7583932048114557e-05,
+      "loss": 0.6174,
+      "step": 15066
+    },
+    {
+      "epoch": 0.21800092416103506,
+      "grad_norm": 0.9906991720199585,
+      "learning_rate": 2.7500384991771587e-05,
+      "loss": 0.6153,
+      "step": 15097
+    },
+    {
+      "epoch": 0.21844856466239243,
+      "grad_norm": 0.8844815492630005,
+      "learning_rate": 2.7416809718923825e-05,
+      "loss": 0.6113,
+      "step": 15128
+    },
+    {
+      "epoch": 0.21889620516374977,
+      "grad_norm": 1.0258604288101196,
+      "learning_rate": 2.7333207172706864e-05,
+      "loss": 0.6111,
+      "step": 15159
+    },
+    {
+      "epoch": 0.21934384566510715,
+      "grad_norm": 0.8992047309875488,
+      "learning_rate": 2.7249578296564088e-05,
+      "loss": 0.6083,
+      "step": 15190
+    },
+    {
+      "epoch": 0.21979148616646452,
+      "grad_norm": 0.991061806678772,
+      "learning_rate": 2.7165924034235973e-05,
+      "loss": 0.6219,
+      "step": 15221
+    },
+    {
+      "epoch": 0.22023912666782186,
+      "grad_norm": 0.9700108766555786,
+      "learning_rate": 2.708224532974953e-05,
+      "loss": 0.6119,
+      "step": 15252
+    }
+  ],
+  "logging_steps": 31,
+  "max_steps": 30517,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 7630,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1258280394232955e+19,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-15260/training_args.bin b/checkpoint-15260/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542
--- /dev/null
+++ b/checkpoint-15260/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3
+size 5432
diff --git a/checkpoint-22890/config.json b/checkpoint-22890/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09
--- /dev/null
+++ b/checkpoint-22890/config.json
@@ -0,0 +1,36 @@
+{
+  "_name_or_path": "meta-llama/Llama-3.1-8B",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128001,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0.dev0",
+  "use_cache": true,
+  "vocab_size": 128256
+}
diff --git a/checkpoint-22890/generation_config.json b/checkpoint-22890/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507
--- /dev/null
+++ b/checkpoint-22890/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": 128001,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.47.0.dev0"
+}
diff --git a/checkpoint-22890/model-00001-of-00007.safetensors b/checkpoint-22890/model-00001-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..481f5f0eceeecb729de47911bc850cda733bf744
--- /dev/null
+++ b/checkpoint-22890/model-00001-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a65a75b982671f6fe429d8b4f2a643bc8cc123b81a8b3e23954d5f9dd4dfe741
+size 4886466168
diff --git a/checkpoint-22890/model-00002-of-00007.safetensors b/checkpoint-22890/model-00002-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961
--- /dev/null
+++ b/checkpoint-22890/model-00002-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64
+size 4832007448
diff --git a/checkpoint-22890/model-00003-of-00007.safetensors b/checkpoint-22890/model-00003-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff
--- /dev/null
+++ b/checkpoint-22890/model-00003-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97
+size 4999813112
diff --git a/checkpoint-22890/model-00004-of-00007.safetensors b/checkpoint-22890/model-00004-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a
--- /dev/null
+++ b/checkpoint-22890/model-00004-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042
+size 4999813128
diff --git a/checkpoint-22890/model-00005-of-00007.safetensors b/checkpoint-22890/model-00005-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89
--- /dev/null
+++ b/checkpoint-22890/model-00005-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7
+size 4832007496
diff --git a/checkpoint-22890/model-00006-of-00007.safetensors b/checkpoint-22890/model-00006-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6857e095b693a515a6b452b2978d8ec6beb1914d
--- /dev/null
+++ b/checkpoint-22890/model-00006-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1428de016ad23aaf1f31a13925e31fb9295785a7a8a83467da982e10e32ab7f1
+size 4999813120
diff --git a/checkpoint-22890/model-00007-of-00007.safetensors b/checkpoint-22890/model-00007-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..28d49af3d02ee8b841af095b31254b0104cf488e
--- /dev/null
+++ b/checkpoint-22890/model-00007-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b44e4ad77bf2ef84e92e62da86ec1c56c5090c09e1194074f9b844107d09805
+size 2571158184
diff --git a/checkpoint-22890/model.safetensors.index.json b/checkpoint-22890/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13
--- /dev/null
+++ b/checkpoint-22890/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 32121044992
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00007-of-00007.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.norm.weight": "model-00007-of-00007.safetensors"
+  }
+}
diff --git a/checkpoint-22890/optimizer.pt b/checkpoint-22890/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..debe2927cf1c50506929531bbec61e776094d63c
--- /dev/null
+++ b/checkpoint-22890/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a7f0e296306f8b81a363f3e59074fa525d4d241d146378e78b07b98214f8e1d
+size 15385036334
diff --git a/checkpoint-22890/rng_state.pth b/checkpoint-22890/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93a52228ea16468eaab41b66555ced3eaaea002c
--- /dev/null
+++ b/checkpoint-22890/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
+size 14244
diff --git a/checkpoint-22890/scheduler.pt b/checkpoint-22890/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..52df82b086a98dd5b315cd1cef2e8d75fddc67aa
--- /dev/null
+++ b/checkpoint-22890/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cceae98d85860224f83926a9c2d0797a9f4411abc8a933e66f23226e9ba794a
+size 1064
diff --git a/checkpoint-22890/trainer_state.json b/checkpoint-22890/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4d09807aa8783e017c749e1097c4302742da03a
--- /dev/null
+++ b/checkpoint-22890/trainer_state.json
@@ -0,0 +1,5199 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3305319701958066,
+  "eval_steps": 500,
+  "global_step": 22890,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0004476405013573615,
+      "grad_norm": 4.6696085929870605,
+      "learning_rate": 1.0157273918741808e-06,
+      "loss": 0.9366,
+      "step": 31
+    },
+    {
+      "epoch": 0.000895281002714723,
+      "grad_norm": 4.250915050506592,
+      "learning_rate": 2.0314547837483616e-06,
+      "loss": 0.9002,
+      "step": 62
+    },
+    {
+      "epoch": 0.0013429215040720846,
+      "grad_norm": 4.424270153045654,
+      "learning_rate": 3.0471821756225426e-06,
+      "loss": 0.8843,
+      "step": 93
+    },
+    {
+      "epoch": 0.001790562005429446,
+      "grad_norm": 4.56964635848999,
+      "learning_rate": 4.062909567496723e-06,
+      "loss": 0.8717,
+      "step": 124
+    },
+    {
+      "epoch": 0.0022382025067868077,
+      "grad_norm": 4.051624298095703,
+      "learning_rate": 5.078636959370905e-06,
+      "loss": 0.8711,
+      "step": 155
+    },
+    {
+      "epoch": 0.002685843008144169,
+      "grad_norm": 3.98006272315979,
+      "learning_rate": 6.094364351245085e-06,
+      "loss": 0.8628,
+      "step": 186
+    },
+    {
+      "epoch": 0.0031334835095015307,
+      "grad_norm": 4.4158406257629395,
+      "learning_rate": 7.110091743119267e-06,
+      "loss": 0.871,
+      "step": 217
+    },
+    {
+      "epoch": 0.003581124010858892,
+      "grad_norm": 4.681333541870117,
+      "learning_rate": 8.125819134993446e-06,
+      "loss": 0.8593,
+      "step": 248
+    },
+    {
+      "epoch": 0.004028764512216254,
+      "grad_norm": 3.8057820796966553,
+      "learning_rate": 9.141546526867629e-06,
+      "loss": 0.8558,
+      "step": 279
+    },
+    {
+      "epoch": 0.0044764050135736155,
+      "grad_norm": 4.523633003234863,
+      "learning_rate": 1.015727391874181e-05,
+      "loss": 0.8676,
+      "step": 310
+    },
+    {
+      "epoch": 0.0049240455149309765,
+      "grad_norm": 3.7387187480926514,
+      "learning_rate": 1.117300131061599e-05,
+      "loss": 0.8585,
+      "step": 341
+    },
+    {
+      "epoch": 0.005371686016288338,
+      "grad_norm": 4.187750816345215,
+      "learning_rate": 1.218872870249017e-05,
+      "loss": 0.8592,
+      "step": 372
+    },
+    {
+      "epoch": 0.005819326517645699,
+      "grad_norm": 3.782883644104004,
+      "learning_rate": 1.3204456094364351e-05,
+      "loss": 0.8449,
+      "step": 403
+    },
+    {
+      "epoch": 0.006266967019003061,
+      "grad_norm": 3.577796459197998,
+      "learning_rate": 1.4220183486238533e-05,
+      "loss": 0.8418,
+      "step": 434
+    },
+    {
+      "epoch": 0.006714607520360423,
+      "grad_norm": 3.1408321857452393,
+      "learning_rate": 1.5235910878112714e-05,
+      "loss": 0.8577,
+      "step": 465
+    },
+    {
+      "epoch": 0.007162248021717784,
+      "grad_norm": 4.090081691741943,
+      "learning_rate": 1.6251638269986893e-05,
+      "loss": 0.8439,
+      "step": 496
+    },
+    {
+      "epoch": 0.007609888523075146,
+      "grad_norm": 2.7458200454711914,
+      "learning_rate": 1.7267365661861077e-05,
+      "loss": 0.8468,
+      "step": 527
+    },
+    {
+      "epoch": 0.008057529024432507,
+      "grad_norm": 3.703225612640381,
+      "learning_rate": 1.8283093053735257e-05,
+      "loss": 0.8385,
+      "step": 558
+    },
+    {
+      "epoch": 0.008505169525789868,
+      "grad_norm": 3.134650230407715,
+      "learning_rate": 1.9298820445609438e-05,
+      "loss": 0.8418,
+      "step": 589
+    },
+    {
+      "epoch": 0.008952810027147231,
+      "grad_norm": 3.762680768966675,
+      "learning_rate": 2.031454783748362e-05,
+      "loss": 0.8312,
+      "step": 620
+    },
+    {
+      "epoch": 0.009400450528504592,
+      "grad_norm": 3.751004457473755,
+      "learning_rate": 2.13302752293578e-05,
+      "loss": 0.8251,
+      "step": 651
+    },
+    {
+      "epoch": 0.009848091029861953,
+      "grad_norm": 3.2268712520599365,
+      "learning_rate": 2.234600262123198e-05,
+      "loss": 0.8369,
+      "step": 682
+    },
+    {
+      "epoch": 0.010295731531219316,
+      "grad_norm": 3.5854289531707764,
+      "learning_rate": 2.336173001310616e-05,
+      "loss": 0.826,
+      "step": 713
+    },
+    {
+      "epoch": 0.010743372032576677,
+      "grad_norm": 3.9910435676574707,
+      "learning_rate": 2.437745740498034e-05,
+      "loss": 0.8168,
+      "step": 744
+    },
+    {
+      "epoch": 0.011191012533934038,
+      "grad_norm": 3.3059303760528564,
+      "learning_rate": 2.5393184796854525e-05,
+      "loss": 0.8269,
+      "step": 775
+    },
+    {
+      "epoch": 0.011638653035291399,
+      "grad_norm": 3.4081811904907227,
+      "learning_rate": 2.6408912188728702e-05,
+      "loss": 0.817,
+      "step": 806
+    },
+    {
+      "epoch": 0.012086293536648762,
+      "grad_norm": 3.2740163803100586,
+      "learning_rate": 2.7424639580602886e-05,
+      "loss": 0.8195,
+      "step": 837
+    },
+    {
+      "epoch": 0.012533934038006123,
+      "grad_norm": 2.7206223011016846,
+      "learning_rate": 2.8440366972477066e-05,
+      "loss": 0.8188,
+      "step": 868
+    },
+    {
+      "epoch": 0.012981574539363484,
+      "grad_norm": 2.7005629539489746,
+      "learning_rate": 2.9456094364351244e-05,
+      "loss": 0.8127,
+      "step": 899
+    },
+    {
+      "epoch": 0.013429215040720846,
+      "grad_norm": 2.970745325088501,
+      "learning_rate": 3.0471821756225428e-05,
+      "loss": 0.8126,
+      "step": 930
+    },
+    {
+      "epoch": 0.013876855542078207,
+      "grad_norm": 2.4761953353881836,
+      "learning_rate": 3.148754914809961e-05,
+      "loss": 0.82,
+      "step": 961
+    },
+    {
+      "epoch": 0.014324496043435568,
+      "grad_norm": 2.8555397987365723,
+      "learning_rate": 3.2503276539973785e-05,
+      "loss": 0.8166,
+      "step": 992
+    },
+    {
+      "epoch": 0.01477213654479293,
+      "grad_norm": 2.8124194145202637,
+      "learning_rate": 3.351900393184797e-05,
+      "loss": 0.8057,
+      "step": 1023
+    },
+    {
+      "epoch": 0.015219777046150292,
+      "grad_norm": 2.353851556777954,
+      "learning_rate": 3.453473132372215e-05,
+      "loss": 0.8064,
+      "step": 1054
+    },
+    {
+      "epoch": 0.015667417547507653,
+      "grad_norm": 3.0127620697021484,
+      "learning_rate": 3.555045871559633e-05,
+      "loss": 0.8086,
+      "step": 1085
+    },
+    {
+      "epoch": 0.016115058048865014,
+      "grad_norm": 2.792686939239502,
+      "learning_rate": 3.6566186107470514e-05,
+      "loss": 0.8152,
+      "step": 1116
+    },
+    {
+      "epoch": 0.016562698550222375,
+      "grad_norm": 2.407134532928467,
+      "learning_rate": 3.7581913499344695e-05,
+      "loss": 0.7949,
+      "step": 1147
+    },
+    {
+      "epoch": 0.017010339051579736,
+      "grad_norm": 2.6921393871307373,
+      "learning_rate": 3.8597640891218876e-05,
+      "loss": 0.804,
+      "step": 1178
+    },
+    {
+      "epoch": 0.0174579795529371,
+      "grad_norm": 2.3015975952148438,
+      "learning_rate": 3.9613368283093056e-05,
+      "loss": 0.7944,
+      "step": 1209
+    },
+    {
+      "epoch": 0.017905620054294462,
+      "grad_norm": 2.8116579055786133,
+      "learning_rate": 4.062909567496724e-05,
+      "loss": 0.7977,
+      "step": 1240
+    },
+    {
+      "epoch": 0.018353260555651823,
+      "grad_norm": 2.5720036029815674,
+      "learning_rate": 4.164482306684142e-05,
+      "loss": 0.7854,
+      "step": 1271
+    },
+    {
+      "epoch": 0.018800901057009184,
+      "grad_norm": 2.0802650451660156,
+      "learning_rate": 4.26605504587156e-05,
+      "loss": 0.7892,
+      "step": 1302
+    },
+    {
+      "epoch": 0.019248541558366545,
+      "grad_norm": 2.4343624114990234,
+      "learning_rate": 4.367627785058978e-05,
+      "loss": 0.7897,
+      "step": 1333
+    },
+    {
+      "epoch": 0.019696182059723906,
+      "grad_norm": 2.509686231613159,
+      "learning_rate": 4.469200524246396e-05,
+      "loss": 0.7855,
+      "step": 1364
+    },
+    {
+      "epoch": 0.020143822561081267,
+      "grad_norm": 2.626512289047241,
+      "learning_rate": 4.570773263433814e-05,
+      "loss": 0.7873,
+      "step": 1395
+    },
+    {
+      "epoch": 0.02059146306243863,
+      "grad_norm": 2.8619399070739746,
+      "learning_rate": 4.672346002621232e-05,
+      "loss": 0.7891,
+      "step": 1426
+    },
+    {
+      "epoch": 0.021039103563795993,
+      "grad_norm": 2.724792718887329,
+      "learning_rate": 4.77391874180865e-05,
+      "loss": 0.782,
+      "step": 1457
+    },
+    {
+      "epoch": 0.021486744065153354,
+      "grad_norm": 2.6659562587738037,
+      "learning_rate": 4.875491480996068e-05,
+      "loss": 0.7856,
+      "step": 1488
+    },
+    {
+      "epoch": 0.021934384566510715,
+      "grad_norm": 2.646078586578369,
+      "learning_rate": 4.977064220183487e-05,
+      "loss": 0.7748,
+      "step": 1519
+    },
+    {
+      "epoch": 0.022382025067868076,
+      "grad_norm": 2.429288387298584,
+      "learning_rate": 4.9999915451558777e-05,
+      "loss": 0.7722,
+      "step": 1550
+    },
+    {
+      "epoch": 0.022829665569225437,
+      "grad_norm": 1.9933409690856934,
+      "learning_rate": 4.999955597496219e-05,
+      "loss": 0.7874,
+      "step": 1581
+    },
+    {
+      "epoch": 0.023277306070582798,
+      "grad_norm": 2.314889907836914,
+      "learning_rate": 4.9998914381774255e-05,
+      "loss": 0.7757,
+      "step": 1612
+    },
+    {
+      "epoch": 0.023724946571940162,
+      "grad_norm": 2.2891199588775635,
+      "learning_rate": 4.999799067923527e-05,
+      "loss": 0.7713,
+      "step": 1643
+    },
+    {
+      "epoch": 0.024172587073297523,
+      "grad_norm": 2.4892444610595703,
+      "learning_rate": 4.999678487776908e-05,
+      "loss": 0.7687,
+      "step": 1674
+    },
+    {
+      "epoch": 0.024620227574654884,
+      "grad_norm": 2.3015685081481934,
+      "learning_rate": 4.9995296990983006e-05,
+      "loss": 0.7721,
+      "step": 1705
+    },
+    {
+      "epoch": 0.025067868076012245,
+      "grad_norm": 2.278954029083252,
+      "learning_rate": 4.999352703566763e-05,
+      "loss": 0.7741,
+      "step": 1736
+    },
+    {
+      "epoch": 0.025515508577369606,
+      "grad_norm": 1.7260370254516602,
+      "learning_rate": 4.999147503179668e-05,
+      "loss": 0.7681,
+      "step": 1767
+    },
+    {
+      "epoch": 0.025963149078726967,
+      "grad_norm": 2.0179309844970703,
+      "learning_rate": 4.998914100252672e-05,
+      "loss": 0.7604,
+      "step": 1798
+    },
+    {
+      "epoch": 0.02641078958008433,
+      "grad_norm": 2.53022837638855,
+      "learning_rate": 4.998652497419696e-05,
+      "loss": 0.7598,
+      "step": 1829
+    },
+    {
+      "epoch": 0.026858430081441693,
+      "grad_norm": 1.859253168106079,
+      "learning_rate": 4.9983626976328927e-05,
+      "loss": 0.7606,
+      "step": 1860
+    },
+    {
+      "epoch": 0.027306070582799054,
+      "grad_norm": 1.759303331375122,
+      "learning_rate": 4.998044704162613e-05,
+      "loss": 0.7532,
+      "step": 1891
+    },
+    {
+      "epoch": 0.027753711084156415,
+      "grad_norm": 2.4389419555664062,
+      "learning_rate": 4.9976985205973705e-05,
+      "loss": 0.7646,
+      "step": 1922
+    },
+    {
+      "epoch": 0.028201351585513776,
+      "grad_norm": 2.155348777770996,
+      "learning_rate": 4.997324150843799e-05,
+      "loss": 0.7569,
+      "step": 1953
+    },
+    {
+      "epoch": 0.028648992086871137,
+      "grad_norm": 2.0138537883758545,
+      "learning_rate": 4.99692159912661e-05,
+      "loss": 0.7677,
+      "step": 1984
+    },
+    {
+      "epoch": 0.029096632588228498,
+      "grad_norm": 2.5275282859802246,
+      "learning_rate": 4.996490869988546e-05,
+      "loss": 0.7519,
+      "step": 2015
+    },
+    {
+      "epoch": 0.02954427308958586,
+      "grad_norm": 1.8147333860397339,
+      "learning_rate": 4.996031968290326e-05,
+      "loss": 0.7509,
+      "step": 2046
+    },
+    {
+      "epoch": 0.029991913590943223,
+      "grad_norm": 2.1941769123077393,
+      "learning_rate": 4.995544899210594e-05,
+      "loss": 0.754,
+      "step": 2077
+    },
+    {
+      "epoch": 0.030439554092300584,
+      "grad_norm": 1.8953059911727905,
+      "learning_rate": 4.9950296682458583e-05,
+      "loss": 0.747,
+      "step": 2108
+    },
+    {
+      "epoch": 0.030887194593657945,
+      "grad_norm": 3.3973031044006348,
+      "learning_rate": 4.994486281210429e-05,
+      "loss": 0.7513,
+      "step": 2139
+    },
+    {
+      "epoch": 0.031334835095015307,
+      "grad_norm": 2.66795015335083,
+      "learning_rate": 4.9939147442363566e-05,
+      "loss": 0.7469,
+      "step": 2170
+    },
+    {
+      "epoch": 0.03178247559637267,
+      "grad_norm": 1.6254230737686157,
+      "learning_rate": 4.9933150637733574e-05,
+      "loss": 0.7297,
+      "step": 2201
+    },
+    {
+      "epoch": 0.03223011609773003,
+      "grad_norm": 1.822745680809021,
+      "learning_rate": 4.992687246588743e-05,
+      "loss": 0.754,
+      "step": 2232
+    },
+    {
+      "epoch": 0.03267775659908739,
+      "grad_norm": 1.6898781061172485,
+      "learning_rate": 4.992031299767347e-05,
+      "loss": 0.7478,
+      "step": 2263
+    },
+    {
+      "epoch": 0.03312539710044475,
+      "grad_norm": 1.799280047416687,
+      "learning_rate": 4.9913472307114386e-05,
+      "loss": 0.746,
+      "step": 2294
+    },
+    {
+      "epoch": 0.033573037601802115,
+      "grad_norm": 2.2501840591430664,
+      "learning_rate": 4.9906350471406446e-05,
+      "loss": 0.7408,
+      "step": 2325
+    },
+    {
+      "epoch": 0.03402067810315947,
+      "grad_norm": 2.3315324783325195,
+      "learning_rate": 4.989894757091861e-05,
+      "loss": 0.7301,
+      "step": 2356
+    },
+    {
+      "epoch": 0.03446831860451684,
+      "grad_norm": 1.5820438861846924,
+      "learning_rate": 4.989126368919158e-05,
+      "loss": 0.7305,
+      "step": 2387
+    },
+    {
+      "epoch": 0.0349159591058742,
+      "grad_norm": 2.5696022510528564,
+      "learning_rate": 4.988329891293693e-05,
+      "loss": 0.7337,
+      "step": 2418
+    },
+    {
+      "epoch": 0.03536359960723156,
+      "grad_norm": 1.8880938291549683,
+      "learning_rate": 4.987505333203608e-05,
+      "loss": 0.7385,
+      "step": 2449
+    },
+    {
+      "epoch": 0.035811240108588924,
+      "grad_norm": 2.6148738861083984,
+      "learning_rate": 4.9866527039539276e-05,
+      "loss": 0.7292,
+      "step": 2480
+    },
+    {
+      "epoch": 0.03625888060994628,
+      "grad_norm": 1.6925290822982788,
+      "learning_rate": 4.9857720131664594e-05,
+      "loss": 0.7344,
+      "step": 2511
+    },
+    {
+      "epoch": 0.036706521111303646,
+      "grad_norm": 1.7675210237503052,
+      "learning_rate": 4.9848632707796773e-05,
+      "loss": 0.7354,
+      "step": 2542
+    },
+    {
+      "epoch": 0.037154161612661,
+      "grad_norm": 2.1053173542022705,
+      "learning_rate": 4.9839264870486155e-05,
+      "loss": 0.7272,
+      "step": 2573
+    },
+    {
+      "epoch": 0.03760180211401837,
+      "grad_norm": 1.9718347787857056,
+      "learning_rate": 4.9829616725447526e-05,
+      "loss": 0.7336,
+      "step": 2604
+    },
+    {
+      "epoch": 0.03804944261537573,
+      "grad_norm": 1.5777671337127686,
+      "learning_rate": 4.981968838155888e-05,
+      "loss": 0.7182,
+      "step": 2635
+    },
+    {
+      "epoch": 0.03849708311673309,
+      "grad_norm": 1.905127763748169,
+      "learning_rate": 4.980947995086024e-05,
+      "loss": 0.7296,
+      "step": 2666
+    },
+    {
+      "epoch": 0.038944723618090454,
+      "grad_norm": 1.63962721824646,
+      "learning_rate": 4.979899154855234e-05,
+      "loss": 0.7249,
+      "step": 2697
+    },
+    {
+      "epoch": 0.03939236411944781,
+      "grad_norm": 1.584331750869751,
+      "learning_rate": 4.9788223292995386e-05,
+      "loss": 0.7345,
+      "step": 2728
+    },
+    {
+      "epoch": 0.039840004620805176,
+      "grad_norm": 1.9111014604568481,
+      "learning_rate": 4.977717530570768e-05,
+      "loss": 0.7225,
+      "step": 2759
+    },
+    {
+      "epoch": 0.040287645122162534,
+      "grad_norm": 2.3216073513031006,
+      "learning_rate": 4.976584771136425e-05,
+      "loss": 0.7207,
+      "step": 2790
+    },
+    {
+      "epoch": 0.0407352856235199,
+      "grad_norm": 1.6002410650253296,
+      "learning_rate": 4.975424063779547e-05,
+      "loss": 0.7228,
+      "step": 2821
+    },
+    {
+      "epoch": 0.04118292612487726,
+      "grad_norm": 2.104731798171997,
+      "learning_rate": 4.974235421598557e-05,
+      "loss": 0.7127,
+      "step": 2852
+    },
+    {
+      "epoch": 0.04163056662623462,
+      "grad_norm": 1.7114660739898682,
+      "learning_rate": 4.973018858007122e-05,
+      "loss": 0.7283,
+      "step": 2883
+    },
+    {
+      "epoch": 0.042078207127591985,
+      "grad_norm": 1.948133945465088,
+      "learning_rate": 4.9717743867339963e-05,
+      "loss": 0.7209,
+      "step": 2914
+    },
+    {
+      "epoch": 0.04252584762894934,
+      "grad_norm": 1.621764898300171,
+      "learning_rate": 4.9705020218228695e-05,
+      "loss": 0.7272,
+      "step": 2945
+    },
+    {
+      "epoch": 0.04297348813030671,
+      "grad_norm": 1.6967558860778809,
+      "learning_rate": 4.969201777632205e-05,
+      "loss": 0.7191,
+      "step": 2976
+    },
+    {
+      "epoch": 0.043421128631664065,
+      "grad_norm": 1.6656996011734009,
+      "learning_rate": 4.9678736688350846e-05,
+      "loss": 0.7205,
+      "step": 3007
+    },
+    {
+      "epoch": 0.04386876913302143,
+      "grad_norm": 2.151475191116333,
+      "learning_rate": 4.966517710419033e-05,
+      "loss": 0.7168,
+      "step": 3038
+    },
+    {
+      "epoch": 0.044316409634378794,
+      "grad_norm": 2.213109016418457,
+      "learning_rate": 4.965133917685858e-05,
+      "loss": 0.7139,
+      "step": 3069
+    },
+    {
+      "epoch": 0.04476405013573615,
+      "grad_norm": 1.5380377769470215,
+      "learning_rate": 4.9637223062514714e-05,
+      "loss": 0.7237,
+      "step": 3100
+    },
+    {
+      "epoch": 0.045211690637093516,
+      "grad_norm": 2.312377452850342,
+      "learning_rate": 4.962282892045718e-05,
+      "loss": 0.7156,
+      "step": 3131
+    },
+    {
+      "epoch": 0.04565933113845087,
+      "grad_norm": 1.7220717668533325,
+      "learning_rate": 4.9608156913121904e-05,
+      "loss": 0.7122,
+      "step": 3162
+    },
+    {
+      "epoch": 0.04610697163980824,
+      "grad_norm": 1.802856206893921,
+      "learning_rate": 4.959320720608049e-05,
+      "loss": 0.7128,
+      "step": 3193
+    },
+    {
+      "epoch": 0.046554612141165595,
+      "grad_norm": 1.6629964113235474,
+      "learning_rate": 4.9577979968038354e-05,
+      "loss": 0.7172,
+      "step": 3224
+    },
+    {
+      "epoch": 0.04700225264252296,
+      "grad_norm": 3.440115213394165,
+      "learning_rate": 4.956247537083282e-05,
+      "loss": 0.7213,
+      "step": 3255
+    },
+    {
+      "epoch": 0.047449893143880324,
+      "grad_norm": 1.5721139907836914,
+      "learning_rate": 4.9546693589431145e-05,
+      "loss": 0.7148,
+      "step": 3286
+    },
+    {
+      "epoch": 0.04789753364523768,
+      "grad_norm": 2.0920398235321045,
+      "learning_rate": 4.9530634801928595e-05,
+      "loss": 0.7145,
+      "step": 3317
+    },
+    {
+      "epoch": 0.048345174146595046,
+      "grad_norm": 1.666566014289856,
+      "learning_rate": 4.9514299189546395e-05,
+      "loss": 0.7095,
+      "step": 3348
+    },
+    {
+      "epoch": 0.048792814647952404,
+      "grad_norm": 1.8222129344940186,
+      "learning_rate": 4.949768693662973e-05,
+      "loss": 0.7138,
+      "step": 3379
+    },
+    {
+      "epoch": 0.04924045514930977,
+      "grad_norm": 1.7302964925765991,
+      "learning_rate": 4.948079823064559e-05,
+      "loss": 0.7017,
+      "step": 3410
+    },
+    {
+      "epoch": 0.049688095650667126,
+      "grad_norm": 1.7338463068008423,
+      "learning_rate": 4.946363326218074e-05,
+      "loss": 0.6979,
+      "step": 3441
+    },
+    {
+      "epoch": 0.05013573615202449,
+      "grad_norm": 1.5637450218200684,
+      "learning_rate": 4.9446192224939525e-05,
+      "loss": 0.7011,
+      "step": 3472
+    },
+    {
+      "epoch": 0.050583376653381855,
+      "grad_norm": 1.5632222890853882,
+      "learning_rate": 4.942847531574167e-05,
+      "loss": 0.704,
+      "step": 3503
+    },
+    {
+      "epoch": 0.05103101715473921,
+      "grad_norm": 1.588402509689331,
+      "learning_rate": 4.941048273452008e-05,
+      "loss": 0.7011,
+      "step": 3534
+    },
+    {
+      "epoch": 0.05147865765609658,
+      "grad_norm": 1.8840582370758057,
+      "learning_rate": 4.9392214684318605e-05,
+      "loss": 0.7016,
+      "step": 3565
+    },
+    {
+      "epoch": 0.051926298157453935,
+      "grad_norm": 1.2702268362045288,
+      "learning_rate": 4.93736713712897e-05,
+      "loss": 0.7004,
+      "step": 3596
+    },
+    {
+      "epoch": 0.0523739386588113,
+      "grad_norm": 1.3812692165374756,
+      "learning_rate": 4.9354853004692124e-05,
+      "loss": 0.7046,
+      "step": 3627
+    },
+    {
+      "epoch": 0.05282157916016866,
+      "grad_norm": 1.7257345914840698,
+      "learning_rate": 4.93357597968886e-05,
+      "loss": 0.6976,
+      "step": 3658
+    },
+    {
+      "epoch": 0.05326921966152602,
+      "grad_norm": 1.7458925247192383,
+      "learning_rate": 4.931639196334338e-05,
+      "loss": 0.6997,
+      "step": 3689
+    },
+    {
+      "epoch": 0.053716860162883386,
+      "grad_norm": 2.1996099948883057,
+      "learning_rate": 4.9296749722619826e-05,
+      "loss": 0.6991,
+      "step": 3720
+    },
+    {
+      "epoch": 0.05416450066424074,
+      "grad_norm": 1.6615021228790283,
+      "learning_rate": 4.9276833296377966e-05,
+      "loss": 0.7005,
+      "step": 3751
+    },
+    {
+      "epoch": 0.05461214116559811,
+      "grad_norm": 1.6276952028274536,
+      "learning_rate": 4.925664290937196e-05,
+      "loss": 0.7097,
+      "step": 3782
+    },
+    {
+      "epoch": 0.055059781666955465,
+      "grad_norm": 1.758227825164795,
+      "learning_rate": 4.9236178789447576e-05,
+      "loss": 0.6955,
+      "step": 3813
+    },
+    {
+      "epoch": 0.05550742216831283,
+      "grad_norm": 1.195280909538269,
+      "learning_rate": 4.921544116753962e-05,
+      "loss": 0.7073,
+      "step": 3844
+    },
+    {
+      "epoch": 0.05595506266967019,
+      "grad_norm": 1.6281015872955322,
+      "learning_rate": 4.919443027766935e-05,
+      "loss": 0.7022,
+      "step": 3875
+    },
+    {
+      "epoch": 0.05640270317102755,
+      "grad_norm": 1.3543150424957275,
+      "learning_rate": 4.91731463569418e-05,
+      "loss": 0.7036,
+      "step": 3906
+    },
+    {
+      "epoch": 0.056850343672384916,
+      "grad_norm": 2.16947078704834,
+      "learning_rate": 4.915158964554312e-05,
+      "loss": 0.7007,
+      "step": 3937
+    },
+    {
+      "epoch": 0.057297984173742274,
+      "grad_norm": 1.324578881263733,
+      "learning_rate": 4.912976038673786e-05,
+      "loss": 0.6941,
+      "step": 3968
+    },
+    {
+      "epoch": 0.05774562467509964,
+      "grad_norm": 1.9811108112335205,
+      "learning_rate": 4.9107658826866254e-05,
+      "loss": 0.6908,
+      "step": 3999
+    },
+    {
+      "epoch": 0.058193265176456996,
+      "grad_norm": 1.2975554466247559,
+      "learning_rate": 4.908528521534139e-05,
+      "loss": 0.6936,
+      "step": 4030
+    },
+    {
+      "epoch": 0.05864090567781436,
+      "grad_norm": 1.583282232284546,
+      "learning_rate": 4.906263980464644e-05,
+      "loss": 0.698,
+      "step": 4061
+    },
+    {
+      "epoch": 0.05908854617917172,
+      "grad_norm": 1.3532944917678833,
+      "learning_rate": 4.903972285033178e-05,
+      "loss": 0.7049,
+      "step": 4092
+    },
+    {
+      "epoch": 0.05953618668052908,
+      "grad_norm": 2.1245481967926025,
+      "learning_rate": 4.901653461101213e-05,
+      "loss": 0.7016,
+      "step": 4123
+    },
+    {
+      "epoch": 0.05998382718188645,
+      "grad_norm": 1.6913797855377197,
+      "learning_rate": 4.8993075348363626e-05,
+      "loss": 0.6981,
+      "step": 4154
+    },
+    {
+      "epoch": 0.060431467683243804,
+      "grad_norm": 1.51249098777771,
+      "learning_rate": 4.896934532712084e-05,
+      "loss": 0.6955,
+      "step": 4185
+    },
+    {
+      "epoch": 0.06087910818460117,
+      "grad_norm": 1.3880395889282227,
+      "learning_rate": 4.8945344815073846e-05,
+      "loss": 0.6934,
+      "step": 4216
+    },
+    {
+      "epoch": 0.061326748685958526,
+      "grad_norm": 1.6354159116744995,
+      "learning_rate": 4.892107408306516e-05,
+      "loss": 0.6938,
+      "step": 4247
+    },
+    {
+      "epoch": 0.06177438918731589,
+      "grad_norm": 2.126742362976074,
+      "learning_rate": 4.889653340498669e-05,
+      "loss": 0.7003,
+      "step": 4278
+    },
+    {
+      "epoch": 0.06222202968867325,
+      "grad_norm": 1.7903707027435303,
+      "learning_rate": 4.8871723057776664e-05,
+      "loss": 0.6885,
+      "step": 4309
+    },
+    {
+      "epoch": 0.06266967019003061,
+      "grad_norm": 1.537806510925293,
+      "learning_rate": 4.8846643321416476e-05,
+      "loss": 0.6892,
+      "step": 4340
+    },
+    {
+      "epoch": 0.06311731069138797,
+      "grad_norm": 1.6445434093475342,
+      "learning_rate": 4.882129447892753e-05,
+      "loss": 0.6843,
+      "step": 4371
+    },
+    {
+      "epoch": 0.06356495119274534,
+      "grad_norm": 1.555373191833496,
+      "learning_rate": 4.8795676816368076e-05,
+      "loss": 0.6899,
+      "step": 4402
+    },
+    {
+      "epoch": 0.0640125916941027,
+      "grad_norm": 1.8370277881622314,
+      "learning_rate": 4.876979062282995e-05,
+      "loss": 0.6813,
+      "step": 4433
+    },
+    {
+      "epoch": 0.06446023219546006,
+      "grad_norm": 1.3132514953613281,
+      "learning_rate": 4.8743636190435325e-05,
+      "loss": 0.6832,
+      "step": 4464
+    },
+    {
+      "epoch": 0.06490787269681741,
+      "grad_norm": 1.3186298608779907,
+      "learning_rate": 4.871721381433344e-05,
+      "loss": 0.6879,
+      "step": 4495
+    },
+    {
+      "epoch": 0.06535551319817479,
+      "grad_norm": 1.4360268115997314,
+      "learning_rate": 4.869052379269719e-05,
+      "loss": 0.69,
+      "step": 4526
+    },
+    {
+      "epoch": 0.06580315369953214,
+      "grad_norm": 1.670765995979309,
+      "learning_rate": 4.866356642671985e-05,
+      "loss": 0.6865,
+      "step": 4557
+    },
+    {
+      "epoch": 0.0662507942008895,
+      "grad_norm": 1.7548723220825195,
+      "learning_rate": 4.8636342020611634e-05,
+      "loss": 0.6852,
+      "step": 4588
+    },
+    {
+      "epoch": 0.06669843470224687,
+      "grad_norm": 1.5086426734924316,
+      "learning_rate": 4.860885088159626e-05,
+      "loss": 0.6894,
+      "step": 4619
+    },
+    {
+      "epoch": 0.06714607520360423,
+      "grad_norm": 1.3140665292739868,
+      "learning_rate": 4.858109331990751e-05,
+      "loss": 0.6812,
+      "step": 4650
+    },
+    {
+      "epoch": 0.06759371570496159,
+      "grad_norm": 1.4212454557418823,
+      "learning_rate": 4.855306964878567e-05,
+      "loss": 0.6872,
+      "step": 4681
+    },
+    {
+      "epoch": 0.06804135620631895,
+      "grad_norm": 1.3034414052963257,
+      "learning_rate": 4.8524780184474084e-05,
+      "loss": 0.6901,
+      "step": 4712
+    },
+    {
+      "epoch": 0.06848899670767632,
+      "grad_norm": 1.3741438388824463,
+      "learning_rate": 4.8496225246215496e-05,
+      "loss": 0.6875,
+      "step": 4743
+    },
+    {
+      "epoch": 0.06893663720903367,
+      "grad_norm": 1.7262542247772217,
+      "learning_rate": 4.8467405156248505e-05,
+      "loss": 0.6868,
+      "step": 4774
+    },
+    {
+      "epoch": 0.06938427771039103,
+      "grad_norm": 1.3293650150299072,
+      "learning_rate": 4.843832023980392e-05,
+      "loss": 0.6891,
+      "step": 4805
+    },
+    {
+      "epoch": 0.0698319182117484,
+      "grad_norm": 1.3448151350021362,
+      "learning_rate": 4.840897082510106e-05,
+      "loss": 0.6765,
+      "step": 4836
+    },
+    {
+      "epoch": 0.07027955871310576,
+      "grad_norm": 2.961280584335327,
+      "learning_rate": 4.8379357243344084e-05,
+      "loss": 0.6939,
+      "step": 4867
+    },
+    {
+      "epoch": 0.07072719921446312,
+      "grad_norm": 1.8265361785888672,
+      "learning_rate": 4.8349479828718236e-05,
+      "loss": 0.677,
+      "step": 4898
+    },
+    {
+      "epoch": 0.07117483971582048,
+      "grad_norm": 1.490349531173706,
+      "learning_rate": 4.8319338918386075e-05,
+      "loss": 0.6778,
+      "step": 4929
+    },
+    {
+      "epoch": 0.07162248021717785,
+      "grad_norm": 1.3669307231903076,
+      "learning_rate": 4.828893485248369e-05,
+      "loss": 0.6746,
+      "step": 4960
+    },
+    {
+      "epoch": 0.0720701207185352,
+      "grad_norm": 1.3995884656906128,
+      "learning_rate": 4.825826797411682e-05,
+      "loss": 0.6757,
+      "step": 4991
+    },
+    {
+      "epoch": 0.07251776121989256,
+      "grad_norm": 1.1217372417449951,
+      "learning_rate": 4.822733862935702e-05,
+      "loss": 0.6832,
+      "step": 5022
+    },
+    {
+      "epoch": 0.07296540172124993,
+      "grad_norm": 1.2192097902297974,
+      "learning_rate": 4.819614716723775e-05,
+      "loss": 0.6868,
+      "step": 5053
+    },
+    {
+      "epoch": 0.07341304222260729,
+      "grad_norm": 1.5045067071914673,
+      "learning_rate": 4.8164693939750425e-05,
+      "loss": 0.6793,
+      "step": 5084
+    },
+    {
+      "epoch": 0.07386068272396465,
+      "grad_norm": 1.7127234935760498,
+      "learning_rate": 4.813297930184042e-05,
+      "loss": 0.6797,
+      "step": 5115
+    },
+    {
+      "epoch": 0.074308323225322,
+      "grad_norm": 1.846561312675476,
+      "learning_rate": 4.810100361140314e-05,
+      "loss": 0.6767,
+      "step": 5146
+    },
+    {
+      "epoch": 0.07475596372667938,
+      "grad_norm": 1.3076797723770142,
+      "learning_rate": 4.8068767229279885e-05,
+      "loss": 0.6855,
+      "step": 5177
+    },
+    {
+      "epoch": 0.07520360422803674,
+      "grad_norm": 1.4170383214950562,
+      "learning_rate": 4.8036270519253854e-05,
+      "loss": 0.681,
+      "step": 5208
+    },
+    {
+      "epoch": 0.0756512447293941,
+      "grad_norm": 1.2504942417144775,
+      "learning_rate": 4.8003513848046e-05,
+      "loss": 0.6778,
+      "step": 5239
+    },
+    {
+      "epoch": 0.07609888523075146,
+      "grad_norm": 1.1522283554077148,
+      "learning_rate": 4.79704975853109e-05,
+      "loss": 0.6749,
+      "step": 5270
+    },
+    {
+      "epoch": 0.07654652573210882,
+      "grad_norm": 1.6351525783538818,
+      "learning_rate": 4.793722210363262e-05,
+      "loss": 0.6745,
+      "step": 5301
+    },
+    {
+      "epoch": 0.07699416623346618,
+      "grad_norm": 1.5093014240264893,
+      "learning_rate": 4.7903687778520414e-05,
+      "loss": 0.6747,
+      "step": 5332
+    },
+    {
+      "epoch": 0.07744180673482354,
+      "grad_norm": 1.362160563468933,
+      "learning_rate": 4.7869894988404593e-05,
+      "loss": 0.673,
+      "step": 5363
+    },
+    {
+      "epoch": 0.07788944723618091,
+      "grad_norm": 1.2021727561950684,
+      "learning_rate": 4.783584411463221e-05,
+      "loss": 0.6768,
+      "step": 5394
+    },
+    {
+      "epoch": 0.07833708773753827,
+      "grad_norm": 2.1543540954589844,
+      "learning_rate": 4.780153554146274e-05,
+      "loss": 0.672,
+      "step": 5425
+    },
+    {
+      "epoch": 0.07878472823889562,
+      "grad_norm": 1.882712721824646,
+      "learning_rate": 4.7766969656063766e-05,
+      "loss": 0.6926,
+      "step": 5456
+    },
+    {
+      "epoch": 0.079232368740253,
+      "grad_norm": 1.3975650072097778,
+      "learning_rate": 4.773214684850662e-05,
+      "loss": 0.6747,
+      "step": 5487
+    },
+    {
+      "epoch": 0.07968000924161035,
+      "grad_norm": 1.3912913799285889,
+      "learning_rate": 4.769706751176193e-05,
+      "loss": 0.6756,
+      "step": 5518
+    },
+    {
+      "epoch": 0.08012764974296771,
+      "grad_norm": 1.7227635383605957,
+      "learning_rate": 4.7661732041695264e-05,
+      "loss": 0.6694,
+      "step": 5549
+    },
+    {
+      "epoch": 0.08057529024432507,
+      "grad_norm": 1.3151129484176636,
+      "learning_rate": 4.762614083706258e-05,
+      "loss": 0.6715,
+      "step": 5580
+    },
+    {
+      "epoch": 0.08102293074568244,
+      "grad_norm": 1.0972425937652588,
+      "learning_rate": 4.759029429950581e-05,
+      "loss": 0.6661,
+      "step": 5611
+    },
+    {
+      "epoch": 0.0814705712470398,
+      "grad_norm": 1.2346575260162354,
+      "learning_rate": 4.7554192833548235e-05,
+      "loss": 0.66,
+      "step": 5642
+    },
+    {
+      "epoch": 0.08191821174839715,
+      "grad_norm": 1.4536516666412354,
+      "learning_rate": 4.751783684659e-05,
+      "loss": 0.6743,
+      "step": 5673
+    },
+    {
+      "epoch": 0.08236585224975453,
+      "grad_norm": 1.1361631155014038,
+      "learning_rate": 4.748122674890348e-05,
+      "loss": 0.6791,
+      "step": 5704
+    },
+    {
+      "epoch": 0.08281349275111188,
+      "grad_norm": 1.2605111598968506,
+      "learning_rate": 4.7444362953628654e-05,
+      "loss": 0.6797,
+      "step": 5735
+    },
+    {
+      "epoch": 0.08326113325246924,
+      "grad_norm": 1.2355903387069702,
+      "learning_rate": 4.7407245876768424e-05,
+      "loss": 0.6642,
+      "step": 5766
+    },
+    {
+      "epoch": 0.0837087737538266,
+      "grad_norm": 1.6677048206329346,
+      "learning_rate": 4.736987593718397e-05,
+      "loss": 0.6759,
+      "step": 5797
+    },
+    {
+      "epoch": 0.08415641425518397,
+      "grad_norm": 1.4781981706619263,
+      "learning_rate": 4.733225355658999e-05,
+      "loss": 0.6707,
+      "step": 5828
+    },
+    {
+      "epoch": 0.08460405475654133,
+      "grad_norm": 1.138583779335022,
+      "learning_rate": 4.7294379159549926e-05,
+      "loss": 0.6636,
+      "step": 5859
+    },
+    {
+      "epoch": 0.08505169525789869,
+      "grad_norm": 1.529036283493042,
+      "learning_rate": 4.725625317347119e-05,
+      "loss": 0.6705,
+      "step": 5890
+    },
+    {
+      "epoch": 0.08549933575925606,
+      "grad_norm": 1.3216760158538818,
+      "learning_rate": 4.7217876028600374e-05,
+      "loss": 0.6714,
+      "step": 5921
+    },
+    {
+      "epoch": 0.08594697626061341,
+      "grad_norm": 1.1820168495178223,
+      "learning_rate": 4.717924815801832e-05,
+      "loss": 0.6757,
+      "step": 5952
+    },
+    {
+      "epoch": 0.08639461676197077,
+      "grad_norm": 1.393571138381958,
+      "learning_rate": 4.714036999763532e-05,
+      "loss": 0.6672,
+      "step": 5983
+    },
+    {
+      "epoch": 0.08684225726332813,
+      "grad_norm": 1.4574682712554932,
+      "learning_rate": 4.7101241986186116e-05,
+      "loss": 0.6655,
+      "step": 6014
+    },
+    {
+      "epoch": 0.0872898977646855,
+      "grad_norm": 1.138645887374878,
+      "learning_rate": 4.7061864565225e-05,
+      "loss": 0.6663,
+      "step": 6045
+    },
+    {
+      "epoch": 0.08773753826604286,
+      "grad_norm": 1.7602777481079102,
+      "learning_rate": 4.702223817912081e-05,
+      "loss": 0.6695,
+      "step": 6076
+    },
+    {
+      "epoch": 0.08818517876740022,
+      "grad_norm": 1.2323459386825562,
+      "learning_rate": 4.698236327505195e-05,
+      "loss": 0.6636,
+      "step": 6107
+    },
+    {
+      "epoch": 0.08863281926875759,
+      "grad_norm": 1.6881431341171265,
+      "learning_rate": 4.694224030300127e-05,
+      "loss": 0.6653,
+      "step": 6138
+    },
+    {
+      "epoch": 0.08908045977011494,
+      "grad_norm": 1.391417384147644,
+      "learning_rate": 4.690186971575107e-05,
+      "loss": 0.6636,
+      "step": 6169
+    },
+    {
+      "epoch": 0.0895281002714723,
+      "grad_norm": 1.3066257238388062,
+      "learning_rate": 4.6861251968877916e-05,
+      "loss": 0.6777,
+      "step": 6200
+    },
+    {
+      "epoch": 0.08997574077282966,
+      "grad_norm": 1.2001326084136963,
+      "learning_rate": 4.68203875207476e-05,
+      "loss": 0.6683,
+      "step": 6231
+    },
+    {
+      "epoch": 0.09042338127418703,
+      "grad_norm": 1.4361172914505005,
+      "learning_rate": 4.677927683250983e-05,
+      "loss": 0.6634,
+      "step": 6262
+    },
+    {
+      "epoch": 0.09087102177554439,
+      "grad_norm": 8.04520320892334,
+      "learning_rate": 4.6737920368093156e-05,
+      "loss": 0.6752,
+      "step": 6293
+    },
+    {
+      "epoch": 0.09131866227690175,
+      "grad_norm": 1.4874210357666016,
+      "learning_rate": 4.669631859419965e-05,
+      "loss": 0.6733,
+      "step": 6324
+    },
+    {
+      "epoch": 0.09176630277825912,
+      "grad_norm": 1.234491229057312,
+      "learning_rate": 4.6654471980299676e-05,
+      "loss": 0.668,
+      "step": 6355
+    },
+    {
+      "epoch": 0.09221394327961648,
+      "grad_norm": 1.2088687419891357,
+      "learning_rate": 4.661238099862658e-05,
+      "loss": 0.6705,
+      "step": 6386
+    },
+    {
+      "epoch": 0.09266158378097383,
+      "grad_norm": 1.1937814950942993,
+      "learning_rate": 4.657004612417138e-05,
+      "loss": 0.6853,
+      "step": 6417
+    },
+    {
+      "epoch": 0.09310922428233119,
+      "grad_norm": 1.5205374956130981,
+      "learning_rate": 4.6527467834677374e-05,
+      "loss": 0.685,
+      "step": 6448
+    },
+    {
+      "epoch": 0.09355686478368856,
+      "grad_norm": 1.2221660614013672,
+      "learning_rate": 4.648464661063478e-05,
+      "loss": 0.6622,
+      "step": 6479
+    },
+    {
+      "epoch": 0.09400450528504592,
+      "grad_norm": 1.0762608051300049,
+      "learning_rate": 4.6441582935275264e-05,
+      "loss": 0.669,
+      "step": 6510
+    },
+    {
+      "epoch": 0.09445214578640328,
+      "grad_norm": 1.4416946172714233,
+      "learning_rate": 4.6398277294566586e-05,
+      "loss": 0.6674,
+      "step": 6541
+    },
+    {
+      "epoch": 0.09489978628776065,
+      "grad_norm": 1.559158205986023,
+      "learning_rate": 4.6354730177207e-05,
+      "loss": 0.6681,
+      "step": 6572
+    },
+    {
+      "epoch": 0.095347426789118,
+      "grad_norm": 1.3833891153335571,
+      "learning_rate": 4.6310942074619787e-05,
+      "loss": 0.6681,
+      "step": 6603
+    },
+    {
+      "epoch": 0.09579506729047536,
+      "grad_norm": 1.6753300428390503,
+      "learning_rate": 4.626691348094777e-05,
+      "loss": 0.6658,
+      "step": 6634
+    },
+    {
+      "epoch": 0.09624270779183272,
+      "grad_norm": 1.951198697090149,
+      "learning_rate": 4.622264489304762e-05,
+      "loss": 0.6654,
+      "step": 6665
+    },
+    {
+      "epoch": 0.09669034829319009,
+      "grad_norm": 1.2356919050216675,
+      "learning_rate": 4.617813681048434e-05,
+      "loss": 0.6651,
+      "step": 6696
+    },
+    {
+      "epoch": 0.09713798879454745,
+      "grad_norm": 1.2712593078613281,
+      "learning_rate": 4.61333897355256e-05,
+      "loss": 0.6646,
+      "step": 6727
+    },
+    {
+      "epoch": 0.09758562929590481,
+      "grad_norm": 1.1935900449752808,
+      "learning_rate": 4.608840417313604e-05,
+      "loss": 0.674,
+      "step": 6758
+    },
+    {
+      "epoch": 0.09803326979726218,
+      "grad_norm": 1.1649430990219116,
+      "learning_rate": 4.6043180630971646e-05,
+      "loss": 0.6644,
+      "step": 6789
+    },
+    {
+      "epoch": 0.09848091029861954,
+      "grad_norm": 1.4281456470489502,
+      "learning_rate": 4.599771961937391e-05,
+      "loss": 0.6673,
+      "step": 6820
+    },
+    {
+      "epoch": 0.0989285507999769,
+      "grad_norm": 1.3064521551132202,
+      "learning_rate": 4.5952021651364204e-05,
+      "loss": 0.6584,
+      "step": 6851
+    },
+    {
+      "epoch": 0.09937619130133425,
+      "grad_norm": 1.2546554803848267,
+      "learning_rate": 4.590608724263786e-05,
+      "loss": 0.6612,
+      "step": 6882
+    },
+    {
+      "epoch": 0.09982383180269162,
+      "grad_norm": 1.1866974830627441,
+      "learning_rate": 4.585991691155845e-05,
+      "loss": 0.6612,
+      "step": 6913
+    },
+    {
+      "epoch": 0.10027147230404898,
+      "grad_norm": 1.6166640520095825,
+      "learning_rate": 4.581351117915188e-05,
+      "loss": 0.6551,
+      "step": 6944
+    },
+    {
+      "epoch": 0.10071911280540634,
+      "grad_norm": 1.5471700429916382,
+      "learning_rate": 4.5766870569100534e-05,
+      "loss": 0.6607,
+      "step": 6975
+    },
+    {
+      "epoch": 0.10116675330676371,
+      "grad_norm": 1.3361026048660278,
+      "learning_rate": 4.571999560773736e-05,
+      "loss": 0.666,
+      "step": 7006
+    },
+    {
+      "epoch": 0.10161439380812107,
+      "grad_norm": 1.2938140630722046,
+      "learning_rate": 4.5672886824039915e-05,
+      "loss": 0.6547,
+      "step": 7037
+    },
+    {
+      "epoch": 0.10206203430947842,
+      "grad_norm": 1.2688400745391846,
+      "learning_rate": 4.5625544749624435e-05,
+      "loss": 0.6624,
+      "step": 7068
+    },
+    {
+      "epoch": 0.10250967481083578,
+      "grad_norm": 1.6306285858154297,
+      "learning_rate": 4.5577969918739794e-05,
+      "loss": 0.6627,
+      "step": 7099
+    },
+    {
+      "epoch": 0.10295731531219315,
+      "grad_norm": 1.3346176147460938,
+      "learning_rate": 4.5530162868261486e-05,
+      "loss": 0.6577,
+      "step": 7130
+    },
+    {
+      "epoch": 0.10340495581355051,
+      "grad_norm": 1.0933984518051147,
+      "learning_rate": 4.548212413768558e-05,
+      "loss": 0.6602,
+      "step": 7161
+    },
+    {
+      "epoch": 0.10385259631490787,
+      "grad_norm": 1.575859785079956,
+      "learning_rate": 4.543385426912261e-05,
+      "loss": 0.6593,
+      "step": 7192
+    },
+    {
+      "epoch": 0.10430023681626524,
+      "grad_norm": 1.4265861511230469,
+      "learning_rate": 4.53853538072915e-05,
+      "loss": 0.6564,
+      "step": 7223
+    },
+    {
+      "epoch": 0.1047478773176226,
+      "grad_norm": 1.737012267112732,
+      "learning_rate": 4.533662329951336e-05,
+      "loss": 0.6593,
+      "step": 7254
+    },
+    {
+      "epoch": 0.10519551781897996,
+      "grad_norm": 1.0257115364074707,
+      "learning_rate": 4.528766329570536e-05,
+      "loss": 0.6514,
+      "step": 7285
+    },
+    {
+      "epoch": 0.10564315832033731,
+      "grad_norm": 1.5043773651123047,
+      "learning_rate": 4.523847434837447e-05,
+      "loss": 0.6635,
+      "step": 7316
+    },
+    {
+      "epoch": 0.10609079882169468,
+      "grad_norm": 1.5642234086990356,
+      "learning_rate": 4.518905701261128e-05,
+      "loss": 0.6558,
+      "step": 7347
+    },
+    {
+      "epoch": 0.10653843932305204,
+      "grad_norm": 1.1821067333221436,
+      "learning_rate": 4.5139411846083715e-05,
+      "loss": 0.6686,
+      "step": 7378
+    },
+    {
+      "epoch": 0.1069860798244094,
+      "grad_norm": 1.5492759943008423,
+      "learning_rate": 4.508953940903073e-05,
+      "loss": 0.6543,
+      "step": 7409
+    },
+    {
+      "epoch": 0.10743372032576677,
+      "grad_norm": 1.281914234161377,
+      "learning_rate": 4.5039440264255994e-05,
+      "loss": 0.6516,
+      "step": 7440
+    },
+    {
+      "epoch": 0.10788136082712413,
+      "grad_norm": 1.3318305015563965,
+      "learning_rate": 4.498911497712155e-05,
+      "loss": 0.656,
+      "step": 7471
+    },
+    {
+      "epoch": 0.10832900132848149,
+      "grad_norm": 1.3832449913024902,
+      "learning_rate": 4.493856411554142e-05,
+      "loss": 0.6475,
+      "step": 7502
+    },
+    {
+      "epoch": 0.10877664182983884,
+      "grad_norm": 1.3547158241271973,
+      "learning_rate": 4.4887788249975206e-05,
+      "loss": 0.6594,
+      "step": 7533
+    },
+    {
+      "epoch": 0.10922428233119622,
+      "grad_norm": 1.4633681774139404,
+      "learning_rate": 4.4836787953421656e-05,
+      "loss": 0.6707,
+      "step": 7564
+    },
+    {
+      "epoch": 0.10967192283255357,
+      "grad_norm": 1.1781059503555298,
+      "learning_rate": 4.478556380141218e-05,
+      "loss": 0.6626,
+      "step": 7595
+    },
+    {
+      "epoch": 0.11011956333391093,
+      "grad_norm": 1.4727883338928223,
+      "learning_rate": 4.4734116372004375e-05,
+      "loss": 0.6535,
+      "step": 7626
+    },
+    {
+      "epoch": 0.1105672038352683,
+      "grad_norm": 1.3888640403747559,
+      "learning_rate": 4.4682446245775477e-05,
+      "loss": 0.6606,
+      "step": 7657
+    },
+    {
+      "epoch": 0.11101484433662566,
+      "grad_norm": 1.308769941329956,
+      "learning_rate": 4.463055400581586e-05,
+      "loss": 0.6667,
+      "step": 7688
+    },
+    {
+      "epoch": 0.11146248483798302,
+      "grad_norm": 1.3579630851745605,
+      "learning_rate": 4.4578440237722374e-05,
+      "loss": 0.6621,
+      "step": 7719
+    },
+    {
+      "epoch": 0.11191012533934037,
+      "grad_norm": 1.1285645961761475,
+      "learning_rate": 4.452610552959183e-05,
+      "loss": 0.6597,
+      "step": 7750
+    },
+    {
+      "epoch": 0.11235776584069775,
+      "grad_norm": 1.1144675016403198,
+      "learning_rate": 4.447355047201428e-05,
+      "loss": 0.6638,
+      "step": 7781
+    },
+    {
+      "epoch": 0.1128054063420551,
+      "grad_norm": 1.1993658542633057,
+      "learning_rate": 4.4420775658066414e-05,
+      "loss": 0.6704,
+      "step": 7812
+    },
+    {
+      "epoch": 0.11325304684341246,
+      "grad_norm": 1.0608967542648315,
+      "learning_rate": 4.436778168330484e-05,
+      "loss": 0.6573,
+      "step": 7843
+    },
+    {
+      "epoch": 0.11370068734476983,
+      "grad_norm": 1.1210070848464966,
+      "learning_rate": 4.4314569145759353e-05,
+      "loss": 0.6612,
+      "step": 7874
+    },
+    {
+      "epoch": 0.11414832784612719,
+      "grad_norm": 1.2345409393310547,
+      "learning_rate": 4.42611386459262e-05,
+      "loss": 0.65,
+      "step": 7905
+    },
+    {
+      "epoch": 0.11459596834748455,
+      "grad_norm": 1.077025294303894,
+      "learning_rate": 4.420749078676133e-05,
+      "loss": 0.6595,
+      "step": 7936
+    },
+    {
+      "epoch": 0.1150436088488419,
+      "grad_norm": 1.2079277038574219,
+      "learning_rate": 4.4153626173673516e-05,
+      "loss": 0.6442,
+      "step": 7967
+    },
+    {
+      "epoch": 0.11549124935019928,
+      "grad_norm": 1.6710035800933838,
+      "learning_rate": 4.409954541451762e-05,
+      "loss": 0.663,
+      "step": 7998
+    },
+    {
+      "epoch": 0.11593888985155663,
+      "grad_norm": 1.3124401569366455,
+      "learning_rate": 4.404524911958764e-05,
+      "loss": 0.6512,
+      "step": 8029
+    },
+    {
+      "epoch": 0.11638653035291399,
+      "grad_norm": 1.644904375076294,
+      "learning_rate": 4.399073790160989e-05,
+      "loss": 0.6587,
+      "step": 8060
+    },
+    {
+      "epoch": 0.11683417085427136,
+      "grad_norm": 1.181624174118042,
+      "learning_rate": 4.393601237573607e-05,
+      "loss": 0.653,
+      "step": 8091
+    },
+    {
+      "epoch": 0.11728181135562872,
+      "grad_norm": 1.4587918519973755,
+      "learning_rate": 4.388107315953628e-05,
+      "loss": 0.675,
+      "step": 8122
+    },
+    {
+      "epoch": 0.11772945185698608,
+      "grad_norm": 1.2147635221481323,
+      "learning_rate": 4.382592087299212e-05,
+      "loss": 0.6521,
+      "step": 8153
+    },
+    {
+      "epoch": 0.11817709235834344,
+      "grad_norm": 1.0448981523513794,
+      "learning_rate": 4.377055613848964e-05,
+      "loss": 0.6541,
+      "step": 8184
+    },
+    {
+      "epoch": 0.11862473285970081,
+      "grad_norm": 1.4482290744781494,
+      "learning_rate": 4.3714979580812355e-05,
+      "loss": 0.6563,
+      "step": 8215
+    },
+    {
+      "epoch": 0.11907237336105816,
+      "grad_norm": 1.1621575355529785,
+      "learning_rate": 4.365919182713416e-05,
+      "loss": 0.656,
+      "step": 8246
+    },
+    {
+      "epoch": 0.11952001386241552,
+      "grad_norm": 1.1643873453140259,
+      "learning_rate": 4.360319350701226e-05,
+      "loss": 0.6547,
+      "step": 8277
+    },
+    {
+      "epoch": 0.1199676543637729,
+      "grad_norm": 1.4016129970550537,
+      "learning_rate": 4.3546985252380115e-05,
+      "loss": 0.6582,
+      "step": 8308
+    },
+    {
+      "epoch": 0.12041529486513025,
+      "grad_norm": 1.4023685455322266,
+      "learning_rate": 4.349056769754021e-05,
+      "loss": 0.6621,
+      "step": 8339
+    },
+    {
+      "epoch": 0.12086293536648761,
+      "grad_norm": 1.3020285367965698,
+      "learning_rate": 4.3433941479156994e-05,
+      "loss": 0.6674,
+      "step": 8370
+    },
+    {
+      "epoch": 0.12131057586784497,
+      "grad_norm": 1.2162435054779053,
+      "learning_rate": 4.3377107236249647e-05,
+      "loss": 0.6614,
+      "step": 8401
+    },
+    {
+      "epoch": 0.12175821636920234,
+      "grad_norm": 1.1956969499588013,
+      "learning_rate": 4.332006561018488e-05,
+      "loss": 0.6557,
+      "step": 8432
+    },
+    {
+      "epoch": 0.1222058568705597,
+      "grad_norm": 1.1723664999008179,
+      "learning_rate": 4.3262817244669683e-05,
+      "loss": 0.6633,
+      "step": 8463
+    },
+    {
+      "epoch": 0.12265349737191705,
+      "grad_norm": 1.113020658493042,
+      "learning_rate": 4.3205362785744083e-05,
+      "loss": 0.6577,
+      "step": 8494
+    },
+    {
+      "epoch": 0.12310113787327442,
+      "grad_norm": 1.2453004121780396,
+      "learning_rate": 4.314770288177384e-05,
+      "loss": 0.6544,
+      "step": 8525
+    },
+    {
+      "epoch": 0.12354877837463178,
+      "grad_norm": 1.1493890285491943,
+      "learning_rate": 4.308983818344313e-05,
+      "loss": 0.6533,
+      "step": 8556
+    },
+    {
+      "epoch": 0.12399641887598914,
+      "grad_norm": 1.4172496795654297,
+      "learning_rate": 4.3031769343747206e-05,
+      "loss": 0.6542,
+      "step": 8587
+    },
+    {
+      "epoch": 0.1244440593773465,
+      "grad_norm": 1.1840728521347046,
+      "learning_rate": 4.297349701798505e-05,
+      "loss": 0.6476,
+      "step": 8618
+    },
+    {
+      "epoch": 0.12489169987870387,
+      "grad_norm": 1.3720282316207886,
+      "learning_rate": 4.2915021863751916e-05,
+      "loss": 0.6446,
+      "step": 8649
+    },
+    {
+      "epoch": 0.12533934038006123,
+      "grad_norm": 1.1705596446990967,
+      "learning_rate": 4.285634454093198e-05,
+      "loss": 0.6537,
+      "step": 8680
+    },
+    {
+      "epoch": 0.1257869808814186,
+      "grad_norm": 1.0790083408355713,
+      "learning_rate": 4.279746571169086e-05,
+      "loss": 0.6543,
+      "step": 8711
+    },
+    {
+      "epoch": 0.12623462138277594,
+      "grad_norm": 1.1207470893859863,
+      "learning_rate": 4.2738386040468136e-05,
+      "loss": 0.6468,
+      "step": 8742
+    },
+    {
+      "epoch": 0.1266822618841333,
+      "grad_norm": 1.1123065948486328,
+      "learning_rate": 4.2679106193969866e-05,
+      "loss": 0.6596,
+      "step": 8773
+    },
+    {
+      "epoch": 0.12712990238549068,
+      "grad_norm": 1.1579636335372925,
+      "learning_rate": 4.261962684116106e-05,
+      "loss": 0.6458,
+      "step": 8804
+    },
+    {
+      "epoch": 0.12757754288684803,
+      "grad_norm": 1.3112802505493164,
+      "learning_rate": 4.2559948653258145e-05,
+      "loss": 0.6483,
+      "step": 8835
+    },
+    {
+      "epoch": 0.1280251833882054,
+      "grad_norm": 1.1104832887649536,
+      "learning_rate": 4.250007230372134e-05,
+      "loss": 0.645,
+      "step": 8866
+    },
+    {
+      "epoch": 0.12847282388956274,
+      "grad_norm": 1.0218713283538818,
+      "learning_rate": 4.2439998468247126e-05,
+      "loss": 0.6519,
+      "step": 8897
+    },
+    {
+      "epoch": 0.12892046439092011,
+      "grad_norm": 1.0053678750991821,
+      "learning_rate": 4.2379727824760566e-05,
+      "loss": 0.6468,
+      "step": 8928
+    },
+    {
+      "epoch": 0.12936810489227749,
+      "grad_norm": 1.410933017730713,
+      "learning_rate": 4.231926105340768e-05,
+      "loss": 0.6573,
+      "step": 8959
+    },
+    {
+      "epoch": 0.12981574539363483,
+      "grad_norm": 1.5001798868179321,
+      "learning_rate": 4.225859883654776e-05,
+      "loss": 0.6483,
+      "step": 8990
+    },
+    {
+      "epoch": 0.1302633858949922,
+      "grad_norm": 1.112316608428955,
+      "learning_rate": 4.219774185874569e-05,
+      "loss": 0.6483,
+      "step": 9021
+    },
+    {
+      "epoch": 0.13071102639634957,
+      "grad_norm": 1.527464747428894,
+      "learning_rate": 4.213669080676418e-05,
+      "loss": 0.6512,
+      "step": 9052
+    },
+    {
+      "epoch": 0.13115866689770692,
+      "grad_norm": 1.1075704097747803,
+      "learning_rate": 4.2075446369556056e-05,
+      "loss": 0.6577,
+      "step": 9083
+    },
+    {
+      "epoch": 0.1316063073990643,
+      "grad_norm": 0.9589399099349976,
+      "learning_rate": 4.201400923825648e-05,
+      "loss": 0.642,
+      "step": 9114
+    },
+    {
+      "epoch": 0.13205394790042166,
+      "grad_norm": 1.186531901359558,
+      "learning_rate": 4.195238010617511e-05,
+      "loss": 0.6553,
+      "step": 9145
+    },
+    {
+      "epoch": 0.132501588401779,
+      "grad_norm": 1.1176280975341797,
+      "learning_rate": 4.1890559668788344e-05,
+      "loss": 0.6483,
+      "step": 9176
+    },
+    {
+      "epoch": 0.13294922890313637,
+      "grad_norm": 1.4222681522369385,
+      "learning_rate": 4.1828548623731405e-05,
+      "loss": 0.6462,
+      "step": 9207
+    },
+    {
+      "epoch": 0.13339686940449375,
+      "grad_norm": 1.1606040000915527,
+      "learning_rate": 4.1766347670790506e-05,
+      "loss": 0.6514,
+      "step": 9238
+    },
+    {
+      "epoch": 0.1338445099058511,
+      "grad_norm": 1.313774585723877,
+      "learning_rate": 4.170395751189495e-05,
+      "loss": 0.6422,
+      "step": 9269
+    },
+    {
+      "epoch": 0.13429215040720846,
+      "grad_norm": 1.1994171142578125,
+      "learning_rate": 4.164137885110921e-05,
+      "loss": 0.6444,
+      "step": 9300
+    },
+    {
+      "epoch": 0.1347397909085658,
+      "grad_norm": 0.9376353025436401,
+      "learning_rate": 4.157861239462495e-05,
+      "loss": 0.6436,
+      "step": 9331
+    },
+    {
+      "epoch": 0.13518743140992318,
+      "grad_norm": 1.0350178480148315,
+      "learning_rate": 4.1515658850753114e-05,
+      "loss": 0.6447,
+      "step": 9362
+    },
+    {
+      "epoch": 0.13563507191128055,
+      "grad_norm": 1.3630082607269287,
+      "learning_rate": 4.145251892991588e-05,
+      "loss": 0.6427,
+      "step": 9393
+    },
+    {
+      "epoch": 0.1360827124126379,
+      "grad_norm": 1.0362364053726196,
+      "learning_rate": 4.138919334463868e-05,
+      "loss": 0.6443,
+      "step": 9424
+    },
+    {
+      "epoch": 0.13653035291399526,
+      "grad_norm": 1.1442211866378784,
+      "learning_rate": 4.1325682809542124e-05,
+      "loss": 0.6523,
+      "step": 9455
+    },
+    {
+      "epoch": 0.13697799341535263,
+      "grad_norm": 1.4196938276290894,
+      "learning_rate": 4.126198804133398e-05,
+      "loss": 0.6501,
+      "step": 9486
+    },
+    {
+      "epoch": 0.13742563391670998,
+      "grad_norm": 1.3853130340576172,
+      "learning_rate": 4.1198109758801055e-05,
+      "loss": 0.6431,
+      "step": 9517
+    },
+    {
+      "epoch": 0.13787327441806735,
+      "grad_norm": 1.0350273847579956,
+      "learning_rate": 4.113404868280107e-05,
+      "loss": 0.6436,
+      "step": 9548
+    },
+    {
+      "epoch": 0.13832091491942472,
+      "grad_norm": 1.0520857572555542,
+      "learning_rate": 4.106980553625457e-05,
+      "loss": 0.6436,
+      "step": 9579
+    },
+    {
+      "epoch": 0.13876855542078206,
+      "grad_norm": 1.127038836479187,
+      "learning_rate": 4.100538104413674e-05,
+      "loss": 0.639,
+      "step": 9610
+    },
+    {
+      "epoch": 0.13921619592213944,
+      "grad_norm": 1.1070880889892578,
+      "learning_rate": 4.09407759334692e-05,
+      "loss": 0.6366,
+      "step": 9641
+    },
+    {
+      "epoch": 0.1396638364234968,
+      "grad_norm": 1.3045605421066284,
+      "learning_rate": 4.087599093331186e-05,
+      "loss": 0.6496,
+      "step": 9672
+    },
+    {
+      "epoch": 0.14011147692485415,
+      "grad_norm": 1.234647512435913,
+      "learning_rate": 4.081102677475462e-05,
+      "loss": 0.6383,
+      "step": 9703
+    },
+    {
+      "epoch": 0.14055911742621152,
+      "grad_norm": 1.1154453754425049,
+      "learning_rate": 4.0745884190909194e-05,
+      "loss": 0.6454,
+      "step": 9734
+    },
+    {
+      "epoch": 0.14100675792756887,
+      "grad_norm": 1.2422186136245728,
+      "learning_rate": 4.0680563916900796e-05,
+      "loss": 0.6404,
+      "step": 9765
+    },
+    {
+      "epoch": 0.14145439842892624,
+      "grad_norm": 1.2128278017044067,
+      "learning_rate": 4.0615066689859815e-05,
+      "loss": 0.6376,
+      "step": 9796
+    },
+    {
+      "epoch": 0.1419020389302836,
+      "grad_norm": 1.3140804767608643,
+      "learning_rate": 4.0549393248913584e-05,
+      "loss": 0.6316,
+      "step": 9827
+    },
+    {
+      "epoch": 0.14234967943164095,
+      "grad_norm": 1.9198187589645386,
+      "learning_rate": 4.048354433517794e-05,
+      "loss": 0.6383,
+      "step": 9858
+    },
+    {
+      "epoch": 0.14279731993299832,
+      "grad_norm": 1.144679307937622,
+      "learning_rate": 4.0417520691748916e-05,
+      "loss": 0.6383,
+      "step": 9889
+    },
+    {
+      "epoch": 0.1432449604343557,
+      "grad_norm": 1.1679338216781616,
+      "learning_rate": 4.035132306369438e-05,
+      "loss": 0.6414,
+      "step": 9920
+    },
+    {
+      "epoch": 0.14369260093571304,
+      "grad_norm": 0.9563717246055603,
+      "learning_rate": 4.028495219804555e-05,
+      "loss": 0.6327,
+      "step": 9951
+    },
+    {
+      "epoch": 0.1441402414370704,
+      "grad_norm": 1.277036428451538,
+      "learning_rate": 4.021840884378864e-05,
+      "loss": 0.6365,
+      "step": 9982
+    },
+    {
+      "epoch": 0.14458788193842778,
+      "grad_norm": 0.9835182428359985,
+      "learning_rate": 4.015169375185633e-05,
+      "loss": 0.638,
+      "step": 10013
+    },
+    {
+      "epoch": 0.14503552243978513,
+      "grad_norm": 1.090118646621704,
+      "learning_rate": 4.0084807675119396e-05,
+      "loss": 0.6437,
+      "step": 10044
+    },
+    {
+      "epoch": 0.1454831629411425,
+      "grad_norm": 1.1823488473892212,
+      "learning_rate": 4.0017751368378106e-05,
+      "loss": 0.6326,
+      "step": 10075
+    },
+    {
+      "epoch": 0.14593080344249987,
+      "grad_norm": 1.070318341255188,
+      "learning_rate": 3.995052558835377e-05,
+      "loss": 0.6362,
+      "step": 10106
+    },
+    {
+      "epoch": 0.1463784439438572,
+      "grad_norm": 1.2451491355895996,
+      "learning_rate": 3.988313109368017e-05,
+      "loss": 0.6388,
+      "step": 10137
+    },
+    {
+      "epoch": 0.14682608444521458,
+      "grad_norm": 1.2417365312576294,
+      "learning_rate": 3.981556864489504e-05,
+      "loss": 0.6309,
+      "step": 10168
+    },
+    {
+      "epoch": 0.14727372494657193,
+      "grad_norm": 1.251518726348877,
+      "learning_rate": 3.974783900443142e-05,
+      "loss": 0.6365,
+      "step": 10199
+    },
+    {
+      "epoch": 0.1477213654479293,
+      "grad_norm": 1.359750747680664,
+      "learning_rate": 3.9679942936609095e-05,
+      "loss": 0.6386,
+      "step": 10230
+    },
+    {
+      "epoch": 0.14816900594928667,
+      "grad_norm": 1.1073262691497803,
+      "learning_rate": 3.961188120762596e-05,
+      "loss": 0.635,
+      "step": 10261
+    },
+    {
+      "epoch": 0.148616646450644,
+      "grad_norm": 0.9850608706474304,
+      "learning_rate": 3.954365458554938e-05,
+      "loss": 0.6389,
+      "step": 10292
+    },
+    {
+      "epoch": 0.14906428695200138,
+      "grad_norm": 1.2984429597854614,
+      "learning_rate": 3.947526384030751e-05,
+      "loss": 0.6317,
+      "step": 10323
+    },
+    {
+      "epoch": 0.14951192745335876,
+      "grad_norm": 1.1622575521469116,
+      "learning_rate": 3.9406709743680624e-05,
+      "loss": 0.6393,
+      "step": 10354
+    },
+    {
+      "epoch": 0.1499595679547161,
+      "grad_norm": 1.0856871604919434,
+      "learning_rate": 3.9337993069292366e-05,
+      "loss": 0.6351,
+      "step": 10385
+    },
+    {
+      "epoch": 0.15040720845607347,
+      "grad_norm": 1.0153882503509521,
+      "learning_rate": 3.926911459260109e-05,
+      "loss": 0.6282,
+      "step": 10416
+    },
+    {
+      "epoch": 0.15085484895743084,
+      "grad_norm": 1.2039254903793335,
+      "learning_rate": 3.920007509089102e-05,
+      "loss": 0.6365,
+      "step": 10447
+    },
+    {
+      "epoch": 0.1513024894587882,
+      "grad_norm": 1.1179555654525757,
+      "learning_rate": 3.913087534326357e-05,
+      "loss": 0.6311,
+      "step": 10478
+    },
+    {
+      "epoch": 0.15175012996014556,
+      "grad_norm": 1.090903639793396,
+      "learning_rate": 3.9061516130628475e-05,
+      "loss": 0.6401,
+      "step": 10509
+    },
+    {
+      "epoch": 0.15219777046150293,
+      "grad_norm": 0.9228240251541138,
+      "learning_rate": 3.8991998235695025e-05,
+      "loss": 0.6323,
+      "step": 10540
+    },
+    {
+      "epoch": 0.15264541096286027,
+      "grad_norm": 1.0772743225097656,
+      "learning_rate": 3.8922322442963224e-05,
+      "loss": 0.637,
+      "step": 10571
+    },
+    {
+      "epoch": 0.15309305146421764,
+      "grad_norm": 1.0854771137237549,
+      "learning_rate": 3.885248953871491e-05,
+      "loss": 0.6375,
+      "step": 10602
+    },
+    {
+      "epoch": 0.153540691965575,
+      "grad_norm": 1.3902987241744995,
+      "learning_rate": 3.8782500311004915e-05,
+      "loss": 0.6406,
+      "step": 10633
+    },
+    {
+      "epoch": 0.15398833246693236,
+      "grad_norm": 1.180351734161377,
+      "learning_rate": 3.871235554965218e-05,
+      "loss": 0.626,
+      "step": 10664
+    },
+    {
+      "epoch": 0.15443597296828973,
+      "grad_norm": 1.1136449575424194,
+      "learning_rate": 3.864205604623078e-05,
+      "loss": 0.6395,
+      "step": 10695
+    },
+    {
+      "epoch": 0.15488361346964707,
+      "grad_norm": 1.1770708560943604,
+      "learning_rate": 3.857160259406107e-05,
+      "loss": 0.633,
+      "step": 10726
+    },
+    {
+      "epoch": 0.15533125397100445,
+      "grad_norm": 1.1615066528320312,
+      "learning_rate": 3.8500995988200674e-05,
+      "loss": 0.632,
+      "step": 10757
+    },
+    {
+      "epoch": 0.15577889447236182,
+      "grad_norm": 1.2898380756378174,
+      "learning_rate": 3.843023702543556e-05,
+      "loss": 0.6332,
+      "step": 10788
+    },
+    {
+      "epoch": 0.15622653497371916,
+      "grad_norm": 1.0051672458648682,
+      "learning_rate": 3.8359326504270984e-05,
+      "loss": 0.6353,
+      "step": 10819
+    },
+    {
+      "epoch": 0.15667417547507653,
+      "grad_norm": 0.9514272212982178,
+      "learning_rate": 3.828826522492255e-05,
+      "loss": 0.6383,
+      "step": 10850
+    },
+    {
+      "epoch": 0.1571218159764339,
+      "grad_norm": 1.2570873498916626,
+      "learning_rate": 3.821705398930713e-05,
+      "loss": 0.6308,
+      "step": 10881
+    },
+    {
+      "epoch": 0.15756945647779125,
+      "grad_norm": 0.992323637008667,
+      "learning_rate": 3.814569360103385e-05,
+      "loss": 0.6303,
+      "step": 10912
+    },
+    {
+      "epoch": 0.15801709697914862,
+      "grad_norm": 1.255265474319458,
+      "learning_rate": 3.807418486539499e-05,
+      "loss": 0.6349,
+      "step": 10943
+    },
+    {
+      "epoch": 0.158464737480506,
+      "grad_norm": 1.1066702604293823,
+      "learning_rate": 3.80025285893569e-05,
+      "loss": 0.6317,
+      "step": 10974
+    },
+    {
+      "epoch": 0.15891237798186333,
+      "grad_norm": 1.178690791130066,
+      "learning_rate": 3.793072558155093e-05,
+      "loss": 0.639,
+      "step": 11005
+    },
+    {
+      "epoch": 0.1593600184832207,
+      "grad_norm": 1.0850341320037842,
+      "learning_rate": 3.785877665226426e-05,
+      "loss": 0.6375,
+      "step": 11036
+    },
+    {
+      "epoch": 0.15980765898457805,
+      "grad_norm": 1.1378651857376099,
+      "learning_rate": 3.778668261343079e-05,
+      "loss": 0.6287,
+      "step": 11067
+    },
+    {
+      "epoch": 0.16025529948593542,
+      "grad_norm": 1.07688570022583,
+      "learning_rate": 3.771444427862192e-05,
+      "loss": 0.6261,
+      "step": 11098
+    },
+    {
+      "epoch": 0.1607029399872928,
+      "grad_norm": 1.108269453048706,
+      "learning_rate": 3.7642062463037465e-05,
+      "loss": 0.6352,
+      "step": 11129
+    },
+    {
+      "epoch": 0.16115058048865014,
+      "grad_norm": 1.2582095861434937,
+      "learning_rate": 3.7569537983496373e-05,
+      "loss": 0.6312,
+      "step": 11160
+    },
+    {
+      "epoch": 0.1615982209900075,
+      "grad_norm": 0.9823578000068665,
+      "learning_rate": 3.749687165842753e-05,
+      "loss": 0.6253,
+      "step": 11191
+    },
+    {
+      "epoch": 0.16204586149136488,
+      "grad_norm": 1.3922805786132812,
+      "learning_rate": 3.7424064307860536e-05,
+      "loss": 0.6279,
+      "step": 11222
+    },
+    {
+      "epoch": 0.16249350199272222,
+      "grad_norm": 1.2210962772369385,
+      "learning_rate": 3.735111675341645e-05,
+      "loss": 0.6357,
+      "step": 11253
+    },
+    {
+      "epoch": 0.1629411424940796,
+      "grad_norm": 1.0463316440582275,
+      "learning_rate": 3.7278029818298524e-05,
+      "loss": 0.6332,
+      "step": 11284
+    },
+    {
+      "epoch": 0.16338878299543697,
+      "grad_norm": 1.165583848953247,
+      "learning_rate": 3.720480432728287e-05,
+      "loss": 0.627,
+      "step": 11315
+    },
+    {
+      "epoch": 0.1638364234967943,
+      "grad_norm": 1.0995306968688965,
+      "learning_rate": 3.71314411067092e-05,
+      "loss": 0.6283,
+      "step": 11346
+    },
+    {
+      "epoch": 0.16428406399815168,
+      "grad_norm": 1.0279158353805542,
+      "learning_rate": 3.70579409844715e-05,
+      "loss": 0.6287,
+      "step": 11377
+    },
+    {
+      "epoch": 0.16473170449950905,
+      "grad_norm": 1.51092529296875,
+      "learning_rate": 3.698430479000865e-05,
+      "loss": 0.6261,
+      "step": 11408
+    },
+    {
+      "epoch": 0.1651793450008664,
+      "grad_norm": 1.020936369895935,
+      "learning_rate": 3.691053335429509e-05,
+      "loss": 0.6327,
+      "step": 11439
+    },
+    {
+      "epoch": 0.16562698550222377,
+      "grad_norm": 1.0198683738708496,
+      "learning_rate": 3.683662750983147e-05,
+      "loss": 0.6422,
+      "step": 11470
+    },
+    {
+      "epoch": 0.1660746260035811,
+      "grad_norm": 1.2650995254516602,
+      "learning_rate": 3.676258809063518e-05,
+      "loss": 0.6354,
+      "step": 11501
+    },
+    {
+      "epoch": 0.16652226650493848,
+      "grad_norm": 1.1653568744659424,
+      "learning_rate": 3.6688415932231004e-05,
+      "loss": 0.6325,
+      "step": 11532
+    },
+    {
+      "epoch": 0.16696990700629585,
+      "grad_norm": 1.1461430788040161,
+      "learning_rate": 3.661411187164166e-05,
+      "loss": 0.6251,
+      "step": 11563
+    },
+    {
+      "epoch": 0.1674175475076532,
+      "grad_norm": 1.2535974979400635,
+      "learning_rate": 3.65396767473784e-05,
+      "loss": 0.6245,
+      "step": 11594
+    },
+    {
+      "epoch": 0.16786518800901057,
+      "grad_norm": 1.115191102027893,
+      "learning_rate": 3.6465111399431465e-05,
+      "loss": 0.6294,
+      "step": 11625
+    },
+    {
+      "epoch": 0.16831282851036794,
+      "grad_norm": 1.0482964515686035,
+      "learning_rate": 3.6390416669260674e-05,
+      "loss": 0.6247,
+      "step": 11656
+    },
+    {
+      "epoch": 0.16876046901172528,
+      "grad_norm": 1.1431951522827148,
+      "learning_rate": 3.63155933997859e-05,
+      "loss": 0.63,
+      "step": 11687
+    },
+    {
+      "epoch": 0.16920810951308266,
+      "grad_norm": 1.0254175662994385,
+      "learning_rate": 3.624064243537758e-05,
+      "loss": 0.6212,
+      "step": 11718
+    },
+    {
+      "epoch": 0.16965575001444003,
+      "grad_norm": 0.9481080174446106,
+      "learning_rate": 3.616556462184716e-05,
+      "loss": 0.6278,
+      "step": 11749
+    },
+    {
+      "epoch": 0.17010339051579737,
+      "grad_norm": 1.118394374847412,
+      "learning_rate": 3.609036080643755e-05,
+      "loss": 0.6244,
+      "step": 11780
+    },
+    {
+      "epoch": 0.17055103101715474,
+      "grad_norm": 1.1592167615890503,
+      "learning_rate": 3.60150318378136e-05,
+      "loss": 0.621,
+      "step": 11811
+    },
+    {
+      "epoch": 0.1709986715185121,
+      "grad_norm": 0.9984686374664307,
+      "learning_rate": 3.5939578566052465e-05,
+      "loss": 0.6319,
+      "step": 11842
+    },
+    {
+      "epoch": 0.17144631201986946,
+      "grad_norm": 1.0091164112091064,
+      "learning_rate": 3.586400184263408e-05,
+      "loss": 0.6345,
+      "step": 11873
+    },
+    {
+      "epoch": 0.17189395252122683,
+      "grad_norm": 1.0355888605117798,
+      "learning_rate": 3.578830252043148e-05,
+      "loss": 0.6171,
+      "step": 11904
+    },
+    {
+      "epoch": 0.17234159302258417,
+      "grad_norm": 1.1437592506408691,
+      "learning_rate": 3.571248145370125e-05,
+      "loss": 0.6201,
+      "step": 11935
+    },
+    {
+      "epoch": 0.17278923352394154,
+      "grad_norm": 0.9440962672233582,
+      "learning_rate": 3.5636539498073794e-05,
+      "loss": 0.6236,
+      "step": 11966
+    },
+    {
+      "epoch": 0.17323687402529891,
+      "grad_norm": 0.9761082530021667,
+      "learning_rate": 3.556047751054378e-05,
+      "loss": 0.6291,
+      "step": 11997
+    },
+    {
+      "epoch": 0.17368451452665626,
+      "grad_norm": 1.1858127117156982,
+      "learning_rate": 3.548429634946039e-05,
+      "loss": 0.6299,
+      "step": 12028
+    },
+    {
+      "epoch": 0.17413215502801363,
+      "grad_norm": 1.0180195569992065,
+      "learning_rate": 3.540799687451768e-05,
+      "loss": 0.6227,
+      "step": 12059
+    },
+    {
+      "epoch": 0.174579795529371,
+      "grad_norm": 0.9683852195739746,
+      "learning_rate": 3.533157994674485e-05,
+      "loss": 0.626,
+      "step": 12090
+    },
+    {
+      "epoch": 0.17502743603072834,
+      "grad_norm": 1.0338289737701416,
+      "learning_rate": 3.5255046428496546e-05,
+      "loss": 0.6377,
+      "step": 12121
+    },
+    {
+      "epoch": 0.17547507653208572,
+      "grad_norm": 1.1238298416137695,
+      "learning_rate": 3.517839718344311e-05,
+      "loss": 0.6338,
+      "step": 12152
+    },
+    {
+      "epoch": 0.1759227170334431,
+      "grad_norm": 1.0541973114013672,
+      "learning_rate": 3.510163307656086e-05,
+      "loss": 0.6222,
+      "step": 12183
+    },
+    {
+      "epoch": 0.17637035753480043,
+      "grad_norm": 1.1677592992782593,
+      "learning_rate": 3.5024754974122324e-05,
+      "loss": 0.6256,
+      "step": 12214
+    },
+    {
+      "epoch": 0.1768179980361578,
+      "grad_norm": 1.0191985368728638,
+      "learning_rate": 3.494776374368643e-05,
+      "loss": 0.6301,
+      "step": 12245
+    },
+    {
+      "epoch": 0.17726563853751517,
+      "grad_norm": 1.1535918712615967,
+      "learning_rate": 3.4870660254088724e-05,
+      "loss": 0.6253,
+      "step": 12276
+    },
+    {
+      "epoch": 0.17771327903887252,
+      "grad_norm": 1.0887985229492188,
+      "learning_rate": 3.479344537543164e-05,
+      "loss": 0.6335,
+      "step": 12307
+    },
+    {
+      "epoch": 0.1781609195402299,
+      "grad_norm": 1.010688066482544,
+      "learning_rate": 3.4716119979074565e-05,
+      "loss": 0.6251,
+      "step": 12338
+    },
+    {
+      "epoch": 0.17860856004158723,
+      "grad_norm": 0.9745127558708191,
+      "learning_rate": 3.463868493762412e-05,
+      "loss": 0.6241,
+      "step": 12369
+    },
+    {
+      "epoch": 0.1790562005429446,
+      "grad_norm": 1.0414716005325317,
+      "learning_rate": 3.456114112492418e-05,
+      "loss": 0.6237,
+      "step": 12400
+    },
+    {
+      "epoch": 0.17950384104430198,
+      "grad_norm": 1.0457465648651123,
+      "learning_rate": 3.4483489416046164e-05,
+      "loss": 0.6258,
+      "step": 12431
+    },
+    {
+      "epoch": 0.17995148154565932,
+      "grad_norm": 1.0389049053192139,
+      "learning_rate": 3.440573068727905e-05,
+      "loss": 0.6262,
+      "step": 12462
+    },
+    {
+      "epoch": 0.1803991220470167,
+      "grad_norm": 1.255600094795227,
+      "learning_rate": 3.4327865816119495e-05,
+      "loss": 0.6305,
+      "step": 12493
+    },
+    {
+      "epoch": 0.18084676254837406,
+      "grad_norm": 1.0340358018875122,
+      "learning_rate": 3.4249895681262025e-05,
+      "loss": 0.6212,
+      "step": 12524
+    },
+    {
+      "epoch": 0.1812944030497314,
+      "grad_norm": 1.0317034721374512,
+      "learning_rate": 3.417182116258899e-05,
+      "loss": 0.6279,
+      "step": 12555
+    },
+    {
+      "epoch": 0.18174204355108878,
+      "grad_norm": 1.1320221424102783,
+      "learning_rate": 3.409364314116074e-05,
+      "loss": 0.631,
+      "step": 12586
+    },
+    {
+      "epoch": 0.18218968405244615,
+      "grad_norm": 0.9674787521362305,
+      "learning_rate": 3.401536249920559e-05,
+      "loss": 0.627,
+      "step": 12617
+    },
+    {
+      "epoch": 0.1826373245538035,
+      "grad_norm": 0.9329623579978943,
+      "learning_rate": 3.393698012010998e-05,
+      "loss": 0.6244,
+      "step": 12648
+    },
+    {
+      "epoch": 0.18308496505516086,
+      "grad_norm": 1.2081501483917236,
+      "learning_rate": 3.385849688840839e-05,
+      "loss": 0.6295,
+      "step": 12679
+    },
+    {
+      "epoch": 0.18353260555651824,
+      "grad_norm": 0.9842090010643005,
+      "learning_rate": 3.3779913689773414e-05,
+      "loss": 0.6276,
+      "step": 12710
+    },
+    {
+      "epoch": 0.18398024605787558,
+      "grad_norm": 1.1417752504348755,
+      "learning_rate": 3.370123141100578e-05,
+      "loss": 0.6266,
+      "step": 12741
+    },
+    {
+      "epoch": 0.18442788655923295,
+      "grad_norm": 0.9693592190742493,
+      "learning_rate": 3.3622450940024305e-05,
+      "loss": 0.6245,
+      "step": 12772
+    },
+    {
+      "epoch": 0.1848755270605903,
+      "grad_norm": 1.1920111179351807,
+      "learning_rate": 3.35435731658559e-05,
+      "loss": 0.6227,
+      "step": 12803
+    },
+    {
+      "epoch": 0.18532316756194767,
+      "grad_norm": 0.9865401387214661,
+      "learning_rate": 3.346459897862552e-05,
+      "loss": 0.6287,
+      "step": 12834
+    },
+    {
+      "epoch": 0.18577080806330504,
+      "grad_norm": 0.9544184803962708,
+      "learning_rate": 3.338552926954613e-05,
+      "loss": 0.6236,
+      "step": 12865
+    },
+    {
+      "epoch": 0.18621844856466238,
+      "grad_norm": 1.0202548503875732,
+      "learning_rate": 3.330636493090868e-05,
+      "loss": 0.6269,
+      "step": 12896
+    },
+    {
+      "epoch": 0.18666608906601975,
+      "grad_norm": 1.1385433673858643,
+      "learning_rate": 3.322710685607193e-05,
+      "loss": 0.6385,
+      "step": 12927
+    },
+    {
+      "epoch": 0.18711372956737712,
+      "grad_norm": 1.0102901458740234,
+      "learning_rate": 3.314775593945251e-05,
+      "loss": 0.6241,
+      "step": 12958
+    },
+    {
+      "epoch": 0.18756137006873447,
+      "grad_norm": 0.9830989241600037,
+      "learning_rate": 3.3068313076514714e-05,
+      "loss": 0.6243,
+      "step": 12989
+    },
+    {
+      "epoch": 0.18800901057009184,
+      "grad_norm": 1.0044376850128174,
+      "learning_rate": 3.298877916376047e-05,
+      "loss": 0.619,
+      "step": 13020
+    },
+    {
+      "epoch": 0.1884566510714492,
+      "grad_norm": 1.0714712142944336,
+      "learning_rate": 3.290915509871915e-05,
+      "loss": 0.6243,
+      "step": 13051
+    },
+    {
+      "epoch": 0.18890429157280655,
+      "grad_norm": 0.9379229545593262,
+      "learning_rate": 3.282944177993753e-05,
+      "loss": 0.6216,
+      "step": 13082
+    },
+    {
+      "epoch": 0.18935193207416393,
+      "grad_norm": 1.2717514038085938,
+      "learning_rate": 3.274964010696957e-05,
+      "loss": 0.6206,
+      "step": 13113
+    },
+    {
+      "epoch": 0.1897995725755213,
+      "grad_norm": 1.1147576570510864,
+      "learning_rate": 3.266975098036629e-05,
+      "loss": 0.6234,
+      "step": 13144
+    },
+    {
+      "epoch": 0.19024721307687864,
+      "grad_norm": 0.9994730949401855,
+      "learning_rate": 3.258977530166562e-05,
+      "loss": 0.6146,
+      "step": 13175
+    },
+    {
+      "epoch": 0.190694853578236,
+      "grad_norm": 1.195367693901062,
+      "learning_rate": 3.250971397338227e-05,
+      "loss": 0.624,
+      "step": 13206
+    },
+    {
+      "epoch": 0.19114249407959336,
+      "grad_norm": 1.0008747577667236,
+      "learning_rate": 3.2429567898997404e-05,
+      "loss": 0.6182,
+      "step": 13237
+    },
+    {
+      "epoch": 0.19159013458095073,
+      "grad_norm": 1.3223299980163574,
+      "learning_rate": 3.234933798294859e-05,
+      "loss": 0.6193,
+      "step": 13268
+    },
+    {
+      "epoch": 0.1920377750823081,
+      "grad_norm": 1.1946437358856201,
+      "learning_rate": 3.2269025130619535e-05,
+      "loss": 0.6201,
+      "step": 13299
+    },
+    {
+      "epoch": 0.19248541558366544,
+      "grad_norm": 1.1597986221313477,
+      "learning_rate": 3.218863024832985e-05,
+      "loss": 0.6212,
+      "step": 13330
+    },
+    {
+      "epoch": 0.1929330560850228,
+      "grad_norm": 0.9518936276435852,
+      "learning_rate": 3.2108154243324864e-05,
+      "loss": 0.6154,
+      "step": 13361
+    },
+    {
+      "epoch": 0.19338069658638019,
+      "grad_norm": 0.890487790107727,
+      "learning_rate": 3.2027598023765345e-05,
+      "loss": 0.6203,
+      "step": 13392
+    },
+    {
+      "epoch": 0.19382833708773753,
+      "grad_norm": 0.9918534755706787,
+      "learning_rate": 3.194696249871729e-05,
+      "loss": 0.6319,
+      "step": 13423
+    },
+    {
+      "epoch": 0.1942759775890949,
+      "grad_norm": 1.1954073905944824,
+      "learning_rate": 3.186624857814164e-05,
+      "loss": 0.619,
+      "step": 13454
+    },
+    {
+      "epoch": 0.19472361809045227,
+      "grad_norm": 1.1521157026290894,
+      "learning_rate": 3.178545717288401e-05,
+      "loss": 0.6326,
+      "step": 13485
+    },
+    {
+      "epoch": 0.19517125859180962,
+      "grad_norm": 1.0131208896636963,
+      "learning_rate": 3.170458919466444e-05,
+      "loss": 0.6234,
+      "step": 13516
+    },
+    {
+      "epoch": 0.195618899093167,
+      "grad_norm": 1.0429494380950928,
+      "learning_rate": 3.1623645556067063e-05,
+      "loss": 0.6146,
+      "step": 13547
+    },
+    {
+      "epoch": 0.19606653959452436,
+      "grad_norm": 0.9586461782455444,
+      "learning_rate": 3.154262717052985e-05,
+      "loss": 0.6192,
+      "step": 13578
+    },
+    {
+      "epoch": 0.1965141800958817,
+      "grad_norm": 0.9385515451431274,
+      "learning_rate": 3.146153495233426e-05,
+      "loss": 0.6186,
+      "step": 13609
+    },
+    {
+      "epoch": 0.19696182059723907,
+      "grad_norm": 0.9109722375869751,
+      "learning_rate": 3.1380369816594944e-05,
+      "loss": 0.6223,
+      "step": 13640
+    },
+    {
+      "epoch": 0.19740946109859642,
+      "grad_norm": 1.0564444065093994,
+      "learning_rate": 3.129913267924946e-05,
+      "loss": 0.6235,
+      "step": 13671
+    },
+    {
+      "epoch": 0.1978571015999538,
+      "grad_norm": 1.1656286716461182,
+      "learning_rate": 3.121782445704782e-05,
+      "loss": 0.6176,
+      "step": 13702
+    },
+    {
+      "epoch": 0.19830474210131116,
+      "grad_norm": 1.1301069259643555,
+      "learning_rate": 3.11364460675423e-05,
+      "loss": 0.6253,
+      "step": 13733
+    },
+    {
+      "epoch": 0.1987523826026685,
+      "grad_norm": 0.9939395785331726,
+      "learning_rate": 3.1054998429076934e-05,
+      "loss": 0.6223,
+      "step": 13764
+    },
+    {
+      "epoch": 0.19920002310402588,
+      "grad_norm": 1.2881885766983032,
+      "learning_rate": 3.097348246077728e-05,
+      "loss": 0.6177,
+      "step": 13795
+    },
+    {
+      "epoch": 0.19964766360538325,
+      "grad_norm": 1.1002579927444458,
+      "learning_rate": 3.0891899082539924e-05,
+      "loss": 0.6139,
+      "step": 13826
+    },
+    {
+      "epoch": 0.2000953041067406,
+      "grad_norm": 1.045394778251648,
+      "learning_rate": 3.0810249215022233e-05,
+      "loss": 0.6192,
+      "step": 13857
+    },
+    {
+      "epoch": 0.20054294460809796,
+      "grad_norm": 0.9559116959571838,
+      "learning_rate": 3.0728533779631865e-05,
+      "loss": 0.6155,
+      "step": 13888
+    },
+    {
+      "epoch": 0.20099058510945533,
+      "grad_norm": 0.9250887036323547,
+      "learning_rate": 3.064675369851637e-05,
+      "loss": 0.6235,
+      "step": 13919
+    },
+    {
+      "epoch": 0.20143822561081268,
+      "grad_norm": 1.0655368566513062,
+      "learning_rate": 3.056490989455289e-05,
+      "loss": 0.628,
+      "step": 13950
+    },
+    {
+      "epoch": 0.20188586611217005,
+      "grad_norm": 1.07636559009552,
+      "learning_rate": 3.0483003291337596e-05,
+      "loss": 0.6244,
+      "step": 13981
+    },
+    {
+      "epoch": 0.20233350661352742,
+      "grad_norm": 1.050580620765686,
+      "learning_rate": 3.040103481317539e-05,
+      "loss": 0.6222,
+      "step": 14012
+    },
+    {
+      "epoch": 0.20278114711488476,
+      "grad_norm": 1.3754404783248901,
+      "learning_rate": 3.03190053850694e-05,
+      "loss": 0.6151,
+      "step": 14043
+    },
+    {
+      "epoch": 0.20322878761624213,
+      "grad_norm": 1.0527547597885132,
+      "learning_rate": 3.0236915932710573e-05,
+      "loss": 0.6153,
+      "step": 14074
+    },
+    {
+      "epoch": 0.20367642811759948,
+      "grad_norm": 0.9438226819038391,
+      "learning_rate": 3.0154767382467232e-05,
+      "loss": 0.618,
+      "step": 14105
+    },
+    {
+      "epoch": 0.20412406861895685,
+      "grad_norm": 1.0383126735687256,
+      "learning_rate": 3.0072560661374582e-05,
+      "loss": 0.6162,
+      "step": 14136
+    },
+    {
+      "epoch": 0.20457170912031422,
+      "grad_norm": 1.1412239074707031,
+      "learning_rate": 2.999029669712431e-05,
+      "loss": 0.6284,
+      "step": 14167
+    },
+    {
+      "epoch": 0.20501934962167156,
+      "grad_norm": 1.1064159870147705,
+      "learning_rate": 2.990797641805408e-05,
+      "loss": 0.6223,
+      "step": 14198
+    },
+    {
+      "epoch": 0.20546699012302894,
+      "grad_norm": 1.0044069290161133,
+      "learning_rate": 2.982560075313704e-05,
+      "loss": 0.6191,
+      "step": 14229
+    },
+    {
+      "epoch": 0.2059146306243863,
+      "grad_norm": 0.9315604567527771,
+      "learning_rate": 2.9743170631971368e-05,
+      "loss": 0.6207,
+      "step": 14260
+    },
+    {
+      "epoch": 0.20636227112574365,
+      "grad_norm": 0.941224217414856,
+      "learning_rate": 2.9660686984769792e-05,
+      "loss": 0.6207,
+      "step": 14291
+    },
+    {
+      "epoch": 0.20680991162710102,
+      "grad_norm": 1.1239089965820312,
+      "learning_rate": 2.9578150742349047e-05,
+      "loss": 0.6252,
+      "step": 14322
+    },
+    {
+      "epoch": 0.2072575521284584,
+      "grad_norm": 0.9484926462173462,
+      "learning_rate": 2.949556283611942e-05,
+      "loss": 0.6136,
+      "step": 14353
+    },
+    {
+      "epoch": 0.20770519262981574,
+      "grad_norm": 0.9437084197998047,
+      "learning_rate": 2.9412924198074206e-05,
+      "loss": 0.6154,
+      "step": 14384
+    },
+    {
+      "epoch": 0.2081528331311731,
+      "grad_norm": 0.9578093886375427,
+      "learning_rate": 2.9330235760779208e-05,
+      "loss": 0.6191,
+      "step": 14415
+    },
+    {
+      "epoch": 0.20860047363253048,
+      "grad_norm": 1.0657248497009277,
+      "learning_rate": 2.9247498457362188e-05,
+      "loss": 0.6178,
+      "step": 14446
+    },
+    {
+      "epoch": 0.20904811413388782,
+      "grad_norm": 0.853568434715271,
+      "learning_rate": 2.9164713221502373e-05,
+      "loss": 0.6152,
+      "step": 14477
+    },
+    {
+      "epoch": 0.2094957546352452,
+      "grad_norm": 1.0403015613555908,
+      "learning_rate": 2.9081880987419912e-05,
+      "loss": 0.6108,
+      "step": 14508
+    },
+    {
+      "epoch": 0.20994339513660254,
+      "grad_norm": 1.0344171524047852,
+      "learning_rate": 2.8999002689865296e-05,
+      "loss": 0.6155,
+      "step": 14539
+    },
+    {
+      "epoch": 0.2103910356379599,
+      "grad_norm": 1.0755060911178589,
+      "learning_rate": 2.8916079264108852e-05,
+      "loss": 0.6156,
+      "step": 14570
+    },
+    {
+      "epoch": 0.21083867613931728,
+      "grad_norm": 0.8636776208877563,
+      "learning_rate": 2.883311164593017e-05,
+      "loss": 0.6193,
+      "step": 14601
+    },
+    {
+      "epoch": 0.21128631664067463,
+      "grad_norm": 1.0264644622802734,
+      "learning_rate": 2.875010077160754e-05,
+      "loss": 0.6138,
+      "step": 14632
+    },
+    {
+      "epoch": 0.211733957142032,
+      "grad_norm": 1.2590196132659912,
+      "learning_rate": 2.866704757790741e-05,
+      "loss": 0.6202,
+      "step": 14663
+    },
+    {
+      "epoch": 0.21218159764338937,
+      "grad_norm": 1.1028645038604736,
+      "learning_rate": 2.858395300207376e-05,
+      "loss": 0.614,
+      "step": 14694
+    },
+    {
+      "epoch": 0.2126292381447467,
+      "grad_norm": 0.8904405236244202,
+      "learning_rate": 2.8500817981817607e-05,
+      "loss": 0.6152,
+      "step": 14725
+    },
+    {
+      "epoch": 0.21307687864610408,
+      "grad_norm": 0.9810163974761963,
+      "learning_rate": 2.8417643455306336e-05,
+      "loss": 0.6088,
+      "step": 14756
+    },
+    {
+      "epoch": 0.21352451914746146,
+      "grad_norm": 0.9837898015975952,
+      "learning_rate": 2.8334430361153185e-05,
+      "loss": 0.6129,
+      "step": 14787
+    },
+    {
+      "epoch": 0.2139721596488188,
+      "grad_norm": 0.987639844417572,
+      "learning_rate": 2.8251179638406612e-05,
+      "loss": 0.6081,
+      "step": 14818
+    },
+    {
+      "epoch": 0.21441980015017617,
+      "grad_norm": 1.1478586196899414,
+      "learning_rate": 2.8167892226539704e-05,
+      "loss": 0.6146,
+      "step": 14849
+    },
+    {
+      "epoch": 0.21486744065153354,
+      "grad_norm": 1.0885242223739624,
+      "learning_rate": 2.8084569065439588e-05,
+      "loss": 0.6183,
+      "step": 14880
+    },
+    {
+      "epoch": 0.21531508115289089,
+      "grad_norm": 0.9934699535369873,
+      "learning_rate": 2.8001211095396807e-05,
+      "loss": 0.6157,
+      "step": 14911
+    },
+    {
+      "epoch": 0.21576272165424826,
+      "grad_norm": 0.9285492300987244,
+      "learning_rate": 2.791781925709473e-05,
+      "loss": 0.6196,
+      "step": 14942
+    },
+    {
+      "epoch": 0.2162103621556056,
+      "grad_norm": 1.243133783340454,
+      "learning_rate": 2.7834394491598908e-05,
+      "loss": 0.6109,
+      "step": 14973
+    },
+    {
+      "epoch": 0.21665800265696297,
+      "grad_norm": 1.0712559223175049,
+      "learning_rate": 2.7750937740346485e-05,
+      "loss": 0.6268,
+      "step": 15004
+    },
+    {
+      "epoch": 0.21710564315832034,
+      "grad_norm": 1.0762903690338135,
+      "learning_rate": 2.7667449945135564e-05,
+      "loss": 0.6162,
+      "step": 15035
+    },
+    {
+      "epoch": 0.2175532836596777,
+      "grad_norm": 1.043479084968567,
+      "learning_rate": 2.7583932048114557e-05,
+      "loss": 0.6174,
+      "step": 15066
+    },
+    {
+      "epoch": 0.21800092416103506,
+      "grad_norm": 0.9906991720199585,
+      "learning_rate": 2.7500384991771587e-05,
+      "loss": 0.6153,
+      "step": 15097
+    },
+    {
+      "epoch": 0.21844856466239243,
+      "grad_norm": 0.8844815492630005,
+      "learning_rate": 2.7416809718923825e-05,
+      "loss": 0.6113,
+      "step": 15128
+    },
+    {
+      "epoch": 0.21889620516374977,
+      "grad_norm": 1.0258604288101196,
+      "learning_rate": 2.7333207172706864e-05,
+      "loss": 0.6111,
+      "step": 15159
+    },
+    {
+      "epoch": 0.21934384566510715,
+      "grad_norm": 0.8992047309875488,
+      "learning_rate": 2.7249578296564088e-05,
+      "loss": 0.6083,
+      "step": 15190
+    },
+    {
+      "epoch": 0.21979148616646452,
+      "grad_norm": 0.991061806678772,
+      "learning_rate": 2.7165924034235973e-05,
+      "loss": 0.6219,
+      "step": 15221
+    },
+    {
+      "epoch": 0.22023912666782186,
+      "grad_norm": 0.9700108766555786,
+      "learning_rate": 2.708224532974953e-05,
+      "loss": 0.6119,
+      "step": 15252
+    },
+    {
+      "epoch": 0.22068676716917923,
+      "grad_norm": 0.904680609703064,
+      "learning_rate": 2.6998543127407538e-05,
+      "loss": 0.6135,
+      "step": 15283
+    },
+    {
+      "epoch": 0.2211344076705366,
+      "grad_norm": 0.9015173316001892,
+      "learning_rate": 2.6914818371777988e-05,
+      "loss": 0.611,
+      "step": 15314
+    },
+    {
+      "epoch": 0.22158204817189395,
+      "grad_norm": 1.020070195198059,
+      "learning_rate": 2.6831072007683373e-05,
+      "loss": 0.617,
+      "step": 15345
+    },
+    {
+      "epoch": 0.22202968867325132,
+      "grad_norm": 1.0938650369644165,
+      "learning_rate": 2.6747304980190018e-05,
+      "loss": 0.6135,
+      "step": 15376
+    },
+    {
+      "epoch": 0.22247732917460866,
+      "grad_norm": 1.2179347276687622,
+      "learning_rate": 2.6663518234597453e-05,
+      "loss": 0.6108,
+      "step": 15407
+    },
+    {
+      "epoch": 0.22292496967596603,
+      "grad_norm": 0.9314635396003723,
+      "learning_rate": 2.6579712716427696e-05,
+      "loss": 0.6109,
+      "step": 15438
+    },
+    {
+      "epoch": 0.2233726101773234,
+      "grad_norm": 0.9413474798202515,
+      "learning_rate": 2.6495889371414652e-05,
+      "loss": 0.6114,
+      "step": 15469
+    },
+    {
+      "epoch": 0.22382025067868075,
+      "grad_norm": 1.0556674003601074,
+      "learning_rate": 2.6412049145493367e-05,
+      "loss": 0.6114,
+      "step": 15500
+    },
+    {
+      "epoch": 0.22426789118003812,
+      "grad_norm": 0.9029526114463806,
+      "learning_rate": 2.632819298478939e-05,
+      "loss": 0.6152,
+      "step": 15531
+    },
+    {
+      "epoch": 0.2247155316813955,
+      "grad_norm": 1.0554165840148926,
+      "learning_rate": 2.6244321835608105e-05,
+      "loss": 0.6077,
+      "step": 15562
+    },
+    {
+      "epoch": 0.22516317218275284,
+      "grad_norm": 0.9897674918174744,
+      "learning_rate": 2.6160436644424024e-05,
+      "loss": 0.6099,
+      "step": 15593
+    },
+    {
+      "epoch": 0.2256108126841102,
+      "grad_norm": 1.036055326461792,
+      "learning_rate": 2.6076538357870133e-05,
+      "loss": 0.6115,
+      "step": 15624
+    },
+    {
+      "epoch": 0.22605845318546758,
+      "grad_norm": 1.1050103902816772,
+      "learning_rate": 2.5992627922727196e-05,
+      "loss": 0.6132,
+      "step": 15655
+    },
+    {
+      "epoch": 0.22650609368682492,
+      "grad_norm": 1.0429555177688599,
+      "learning_rate": 2.5908706285913066e-05,
+      "loss": 0.6114,
+      "step": 15686
+    },
+    {
+      "epoch": 0.2269537341881823,
+      "grad_norm": 0.8952310681343079,
+      "learning_rate": 2.5824774394472008e-05,
+      "loss": 0.6155,
+      "step": 15717
+    },
+    {
+      "epoch": 0.22740137468953966,
+      "grad_norm": 0.9422932267189026,
+      "learning_rate": 2.5740833195563996e-05,
+      "loss": 0.6115,
+      "step": 15748
+    },
+    {
+      "epoch": 0.227849015190897,
+      "grad_norm": 0.8615415096282959,
+      "learning_rate": 2.5656883636454067e-05,
+      "loss": 0.6147,
+      "step": 15779
+    },
+    {
+      "epoch": 0.22829665569225438,
+      "grad_norm": 1.0953892469406128,
+      "learning_rate": 2.557292666450159e-05,
+      "loss": 0.6141,
+      "step": 15810
+    },
+    {
+      "epoch": 0.22874429619361172,
+      "grad_norm": 1.038050651550293,
+      "learning_rate": 2.5488963227149566e-05,
+      "loss": 0.6118,
+      "step": 15841
+    },
+    {
+      "epoch": 0.2291919366949691,
+      "grad_norm": 1.0005477666854858,
+      "learning_rate": 2.5404994271913983e-05,
+      "loss": 0.6071,
+      "step": 15872
+    },
+    {
+      "epoch": 0.22963957719632647,
+      "grad_norm": 1.1400154829025269,
+      "learning_rate": 2.5321020746373085e-05,
+      "loss": 0.6073,
+      "step": 15903
+    },
+    {
+      "epoch": 0.2300872176976838,
+      "grad_norm": 0.9465575218200684,
+      "learning_rate": 2.52370435981567e-05,
+      "loss": 0.6139,
+      "step": 15934
+    },
+    {
+      "epoch": 0.23053485819904118,
+      "grad_norm": 0.9524116516113281,
+      "learning_rate": 2.5153063774935533e-05,
+      "loss": 0.6112,
+      "step": 15965
+    },
+    {
+      "epoch": 0.23098249870039855,
+      "grad_norm": 1.0909959077835083,
+      "learning_rate": 2.506908222441045e-05,
+      "loss": 0.6062,
+      "step": 15996
+    },
+    {
+      "epoch": 0.2314301392017559,
+      "grad_norm": 0.9520925879478455,
+      "learning_rate": 2.498509989430187e-05,
+      "loss": 0.6066,
+      "step": 16027
+    },
+    {
+      "epoch": 0.23187777970311327,
+      "grad_norm": 0.9747080206871033,
+      "learning_rate": 2.4901117732338958e-05,
+      "loss": 0.6073,
+      "step": 16058
+    },
+    {
+      "epoch": 0.23232542020447064,
+      "grad_norm": 0.8820034265518188,
+      "learning_rate": 2.481713668624899e-05,
+      "loss": 0.6042,
+      "step": 16089
+    },
+    {
+      "epoch": 0.23277306070582798,
+      "grad_norm": 0.873534619808197,
+      "learning_rate": 2.4733157703746663e-05,
+      "loss": 0.6115,
+      "step": 16120
+    },
+    {
+      "epoch": 0.23322070120718535,
+      "grad_norm": 1.0529483556747437,
+      "learning_rate": 2.4649181732523392e-05,
+      "loss": 0.604,
+      "step": 16151
+    },
+    {
+      "epoch": 0.23366834170854273,
+      "grad_norm": 1.0236808061599731,
+      "learning_rate": 2.4565209720236582e-05,
+      "loss": 0.6109,
+      "step": 16182
+    },
+    {
+      "epoch": 0.23411598220990007,
+      "grad_norm": 0.926750898361206,
+      "learning_rate": 2.4481242614498975e-05,
+      "loss": 0.6103,
+      "step": 16213
+    },
+    {
+      "epoch": 0.23456362271125744,
+      "grad_norm": 0.9616347551345825,
+      "learning_rate": 2.439728136286796e-05,
+      "loss": 0.6115,
+      "step": 16244
+    },
+    {
+      "epoch": 0.23501126321261478,
+      "grad_norm": 0.9181815981864929,
+      "learning_rate": 2.4313326912834852e-05,
+      "loss": 0.6093,
+      "step": 16275
+    },
+    {
+      "epoch": 0.23545890371397216,
+      "grad_norm": 1.08785879611969,
+      "learning_rate": 2.4229380211814206e-05,
+      "loss": 0.6056,
+      "step": 16306
+    },
+    {
+      "epoch": 0.23590654421532953,
+      "grad_norm": 1.3456270694732666,
+      "learning_rate": 2.4145442207133124e-05,
+      "loss": 0.5999,
+      "step": 16337
+    },
+    {
+      "epoch": 0.23635418471668687,
+      "grad_norm": 0.9001489281654358,
+      "learning_rate": 2.406151384602059e-05,
+      "loss": 0.6147,
+      "step": 16368
+    },
+    {
+      "epoch": 0.23680182521804424,
+      "grad_norm": 0.9228829145431519,
+      "learning_rate": 2.3977596075596747e-05,
+      "loss": 0.6089,
+      "step": 16399
+    },
+    {
+      "epoch": 0.23724946571940161,
+      "grad_norm": 0.8185672760009766,
+      "learning_rate": 2.3893689842862223e-05,
+      "loss": 0.6064,
+      "step": 16430
+    },
+    {
+      "epoch": 0.23769710622075896,
+      "grad_norm": 0.848855197429657,
+      "learning_rate": 2.3809796094687475e-05,
+      "loss": 0.6078,
+      "step": 16461
+    },
+    {
+      "epoch": 0.23814474672211633,
+      "grad_norm": 1.5285366773605347,
+      "learning_rate": 2.372591577780202e-05,
+      "loss": 0.6016,
+      "step": 16492
+    },
+    {
+      "epoch": 0.2385923872234737,
+      "grad_norm": 1.0771571397781372,
+      "learning_rate": 2.3642049838783838e-05,
+      "loss": 0.6132,
+      "step": 16523
+    },
+    {
+      "epoch": 0.23904002772483104,
+      "grad_norm": 0.8987991809844971,
+      "learning_rate": 2.3558199224048666e-05,
+      "loss": 0.6098,
+      "step": 16554
+    },
+    {
+      "epoch": 0.23948766822618842,
+      "grad_norm": 0.8981488943099976,
+      "learning_rate": 2.347436487983929e-05,
+      "loss": 0.6168,
+      "step": 16585
+    },
+    {
+      "epoch": 0.2399353087275458,
+      "grad_norm": 0.9029120802879333,
+      "learning_rate": 2.3390547752214888e-05,
+      "loss": 0.6116,
+      "step": 16616
+    },
+    {
+      "epoch": 0.24038294922890313,
+      "grad_norm": 1.0437650680541992,
+      "learning_rate": 2.330674878704035e-05,
+      "loss": 0.599,
+      "step": 16647
+    },
+    {
+      "epoch": 0.2408305897302605,
+      "grad_norm": 0.9616511464118958,
+      "learning_rate": 2.322296892997561e-05,
+      "loss": 0.614,
+      "step": 16678
+    },
+    {
+      "epoch": 0.24127823023161785,
+      "grad_norm": 0.8985153436660767,
+      "learning_rate": 2.313920912646497e-05,
+      "loss": 0.6087,
+      "step": 16709
+    },
+    {
+      "epoch": 0.24172587073297522,
+      "grad_norm": 1.0448508262634277,
+      "learning_rate": 2.305547032172643e-05,
+      "loss": 0.6062,
+      "step": 16740
+    },
+    {
+      "epoch": 0.2421735112343326,
+      "grad_norm": 0.9185760021209717,
+      "learning_rate": 2.2971753460741014e-05,
+      "loss": 0.6099,
+      "step": 16771
+    },
+    {
+      "epoch": 0.24262115173568993,
+      "grad_norm": 1.1951557397842407,
+      "learning_rate": 2.288805948824212e-05,
+      "loss": 0.6091,
+      "step": 16802
+    },
+    {
+      "epoch": 0.2430687922370473,
+      "grad_norm": 0.8947639465332031,
+      "learning_rate": 2.2804389348704858e-05,
+      "loss": 0.6101,
+      "step": 16833
+    },
+    {
+      "epoch": 0.24351643273840468,
+      "grad_norm": 1.0335516929626465,
+      "learning_rate": 2.2720743986335374e-05,
+      "loss": 0.6053,
+      "step": 16864
+    },
+    {
+      "epoch": 0.24396407323976202,
+      "grad_norm": 0.9719113111495972,
+      "learning_rate": 2.2637124345060233e-05,
+      "loss": 0.6093,
+      "step": 16895
+    },
+    {
+      "epoch": 0.2444117137411194,
+      "grad_norm": 0.9017343521118164,
+      "learning_rate": 2.2553531368515695e-05,
+      "loss": 0.6096,
+      "step": 16926
+    },
+    {
+      "epoch": 0.24485935424247676,
+      "grad_norm": 0.9254065155982971,
+      "learning_rate": 2.2469966000037144e-05,
+      "loss": 0.6031,
+      "step": 16957
+    },
+    {
+      "epoch": 0.2453069947438341,
+      "grad_norm": 0.9550548195838928,
+      "learning_rate": 2.2386429182648417e-05,
+      "loss": 0.6136,
+      "step": 16988
+    },
+    {
+      "epoch": 0.24575463524519148,
+      "grad_norm": 0.913746178150177,
+      "learning_rate": 2.230292185905114e-05,
+      "loss": 0.6041,
+      "step": 17019
+    },
+    {
+      "epoch": 0.24620227574654885,
+      "grad_norm": 1.0998092889785767,
+      "learning_rate": 2.2219444971614116e-05,
+      "loss": 0.6031,
+      "step": 17050
+    },
+    {
+      "epoch": 0.2466499162479062,
+      "grad_norm": 0.8995510339736938,
+      "learning_rate": 2.2135999462362655e-05,
+      "loss": 0.6043,
+      "step": 17081
+    },
+    {
+      "epoch": 0.24709755674926356,
+      "grad_norm": 1.0682373046875,
+      "learning_rate": 2.2052586272968003e-05,
+      "loss": 0.6091,
+      "step": 17112
+    },
+    {
+      "epoch": 0.2475451972506209,
+      "grad_norm": 0.9658533334732056,
+      "learning_rate": 2.196920634473666e-05,
+      "loss": 0.6062,
+      "step": 17143
+    },
+    {
+      "epoch": 0.24799283775197828,
+      "grad_norm": 0.9547036290168762,
+      "learning_rate": 2.1885860618599787e-05,
+      "loss": 0.6083,
+      "step": 17174
+    },
+    {
+      "epoch": 0.24844047825333565,
+      "grad_norm": 1.1252254247665405,
+      "learning_rate": 2.1802550035102577e-05,
+      "loss": 0.6047,
+      "step": 17205
+    },
+    {
+      "epoch": 0.248888118754693,
+      "grad_norm": 0.8774239420890808,
+      "learning_rate": 2.171927553439363e-05,
+      "loss": 0.6091,
+      "step": 17236
+    },
+    {
+      "epoch": 0.24933575925605037,
+      "grad_norm": 0.9929160475730896,
+      "learning_rate": 2.1636038056214376e-05,
+      "loss": 0.6037,
+      "step": 17267
+    },
+    {
+      "epoch": 0.24978339975740774,
+      "grad_norm": 1.0022073984146118,
+      "learning_rate": 2.155283853988844e-05,
+      "loss": 0.6106,
+      "step": 17298
+    },
+    {
+      "epoch": 0.2502310402587651,
+      "grad_norm": 0.9709188938140869,
+      "learning_rate": 2.146967792431106e-05,
+      "loss": 0.6043,
+      "step": 17329
+    },
+    {
+      "epoch": 0.25067868076012245,
+      "grad_norm": 0.9158416986465454,
+      "learning_rate": 2.138655714793849e-05,
+      "loss": 0.6002,
+      "step": 17360
+    },
+    {
+      "epoch": 0.2511263212614798,
+      "grad_norm": 1.045093059539795,
+      "learning_rate": 2.1303477148777367e-05,
+      "loss": 0.6027,
+      "step": 17391
+    },
+    {
+      "epoch": 0.2515739617628372,
+      "grad_norm": 0.9029024243354797,
+      "learning_rate": 2.122043886437421e-05,
+      "loss": 0.6095,
+      "step": 17422
+    },
+    {
+      "epoch": 0.2520216022641945,
+      "grad_norm": 1.0147509574890137,
+      "learning_rate": 2.1137443231804765e-05,
+      "loss": 0.6072,
+      "step": 17453
+    },
+    {
+      "epoch": 0.2524692427655519,
+      "grad_norm": 0.9794949293136597,
+      "learning_rate": 2.105449118766347e-05,
+      "loss": 0.6048,
+      "step": 17484
+    },
+    {
+      "epoch": 0.25291688326690925,
+      "grad_norm": 1.186495304107666,
+      "learning_rate": 2.097158366805287e-05,
+      "loss": 0.6079,
+      "step": 17515
+    },
+    {
+      "epoch": 0.2533645237682666,
+      "grad_norm": 0.9781451225280762,
+      "learning_rate": 2.0888721608573047e-05,
+      "loss": 0.6033,
+      "step": 17546
+    },
+    {
+      "epoch": 0.253812164269624,
+      "grad_norm": 0.9464316964149475,
+      "learning_rate": 2.0805905944311087e-05,
+      "loss": 0.6057,
+      "step": 17577
+    },
+    {
+      "epoch": 0.25425980477098137,
+      "grad_norm": 0.9456629753112793,
+      "learning_rate": 2.0723137609830497e-05,
+      "loss": 0.6039,
+      "step": 17608
+    },
+    {
+      "epoch": 0.2547074452723387,
+      "grad_norm": 0.9119940400123596,
+      "learning_rate": 2.0640417539160686e-05,
+      "loss": 0.6059,
+      "step": 17639
+    },
+    {
+      "epoch": 0.25515508577369606,
+      "grad_norm": 1.1009196043014526,
+      "learning_rate": 2.0557746665786427e-05,
+      "loss": 0.6081,
+      "step": 17670
+    },
+    {
+      "epoch": 0.2556027262750534,
+      "grad_norm": 1.010501503944397,
+      "learning_rate": 2.0475125922637256e-05,
+      "loss": 0.6081,
+      "step": 17701
+    },
+    {
+      "epoch": 0.2560503667764108,
+      "grad_norm": 0.9617831707000732,
+      "learning_rate": 2.0392556242077047e-05,
+      "loss": 0.6066,
+      "step": 17732
+    },
+    {
+      "epoch": 0.25649800727776817,
+      "grad_norm": 1.0574779510498047,
+      "learning_rate": 2.031003855589343e-05,
+      "loss": 0.6025,
+      "step": 17763
+    },
+    {
+      "epoch": 0.2569456477791255,
+      "grad_norm": 0.9515939950942993,
+      "learning_rate": 2.022757379528727e-05,
+      "loss": 0.6147,
+      "step": 17794
+    },
+    {
+      "epoch": 0.25739328828048286,
+      "grad_norm": 0.8629471659660339,
+      "learning_rate": 2.0145162890862184e-05,
+      "loss": 0.6018,
+      "step": 17825
+    },
+    {
+      "epoch": 0.25784092878184023,
+      "grad_norm": 1.0973188877105713,
+      "learning_rate": 2.0062806772614022e-05,
+      "loss": 0.5974,
+      "step": 17856
+    },
+    {
+      "epoch": 0.2582885692831976,
+      "grad_norm": 1.0111137628555298,
+      "learning_rate": 1.9980506369920392e-05,
+      "loss": 0.6007,
+      "step": 17887
+    },
+    {
+      "epoch": 0.25873620978455497,
+      "grad_norm": 0.938352644443512,
+      "learning_rate": 1.989826261153015e-05,
+      "loss": 0.6062,
+      "step": 17918
+    },
+    {
+      "epoch": 0.25918385028591234,
+      "grad_norm": 0.8754394054412842,
+      "learning_rate": 1.9816076425552923e-05,
+      "loss": 0.5999,
+      "step": 17949
+    },
+    {
+      "epoch": 0.25963149078726966,
+      "grad_norm": 0.9272274374961853,
+      "learning_rate": 1.9733948739448676e-05,
+      "loss": 0.5958,
+      "step": 17980
+    },
+    {
+      "epoch": 0.26007913128862703,
+      "grad_norm": 0.9161437749862671,
+      "learning_rate": 1.9651880480017155e-05,
+      "loss": 0.6068,
+      "step": 18011
+    },
+    {
+      "epoch": 0.2605267717899844,
+      "grad_norm": 1.0073903799057007,
+      "learning_rate": 1.9569872573387516e-05,
+      "loss": 0.6075,
+      "step": 18042
+    },
+    {
+      "epoch": 0.2609744122913418,
+      "grad_norm": 0.8590899705886841,
+      "learning_rate": 1.9487925945007854e-05,
+      "loss": 0.6017,
+      "step": 18073
+    },
+    {
+      "epoch": 0.26142205279269914,
+      "grad_norm": 0.9320747256278992,
+      "learning_rate": 1.9406041519634726e-05,
+      "loss": 0.6025,
+      "step": 18104
+    },
+    {
+      "epoch": 0.2618696932940565,
+      "grad_norm": 1.000109076499939,
+      "learning_rate": 1.932422022132275e-05,
+      "loss": 0.6025,
+      "step": 18135
+    },
+    {
+      "epoch": 0.26231733379541383,
+      "grad_norm": 0.8880858421325684,
+      "learning_rate": 1.924246297341414e-05,
+      "loss": 0.6029,
+      "step": 18166
+    },
+    {
+      "epoch": 0.2627649742967712,
+      "grad_norm": 0.9839984178543091,
+      "learning_rate": 1.9160770698528338e-05,
+      "loss": 0.6009,
+      "step": 18197
+    },
+    {
+      "epoch": 0.2632126147981286,
+      "grad_norm": 0.9712537527084351,
+      "learning_rate": 1.907914431855156e-05,
+      "loss": 0.6005,
+      "step": 18228
+    },
+    {
+      "epoch": 0.26366025529948595,
+      "grad_norm": 0.9593982100486755,
+      "learning_rate": 1.8997584754626412e-05,
+      "loss": 0.5967,
+      "step": 18259
+    },
+    {
+      "epoch": 0.2641078958008433,
+      "grad_norm": 0.9100329279899597,
+      "learning_rate": 1.8916092927141486e-05,
+      "loss": 0.5926,
+      "step": 18290
+    },
+    {
+      "epoch": 0.26455553630220063,
+      "grad_norm": 0.8858036398887634,
+      "learning_rate": 1.883466975572098e-05,
+      "loss": 0.5972,
+      "step": 18321
+    },
+    {
+      "epoch": 0.265003176803558,
+      "grad_norm": 1.0127744674682617,
+      "learning_rate": 1.8753316159214312e-05,
+      "loss": 0.6029,
+      "step": 18352
+    },
+    {
+      "epoch": 0.2654508173049154,
+      "grad_norm": 0.9447472095489502,
+      "learning_rate": 1.8672033055685766e-05,
+      "loss": 0.6066,
+      "step": 18383
+    },
+    {
+      "epoch": 0.26589845780627275,
+      "grad_norm": 0.818134605884552,
+      "learning_rate": 1.8590821362404116e-05,
+      "loss": 0.5953,
+      "step": 18414
+    },
+    {
+      "epoch": 0.2663460983076301,
+      "grad_norm": 0.9979908466339111,
+      "learning_rate": 1.8509681995832294e-05,
+      "loss": 0.5978,
+      "step": 18445
+    },
+    {
+      "epoch": 0.2667937388089875,
+      "grad_norm": 0.8588074445724487,
+      "learning_rate": 1.8428615871617004e-05,
+      "loss": 0.6004,
+      "step": 18476
+    },
+    {
+      "epoch": 0.2672413793103448,
+      "grad_norm": 0.9643010497093201,
+      "learning_rate": 1.8347623904578448e-05,
+      "loss": 0.6071,
+      "step": 18507
+    },
+    {
+      "epoch": 0.2676890198117022,
+      "grad_norm": 0.8365680575370789,
+      "learning_rate": 1.8266707008699975e-05,
+      "loss": 0.5998,
+      "step": 18538
+    },
+    {
+      "epoch": 0.26813666031305955,
+      "grad_norm": 0.8986954689025879,
+      "learning_rate": 1.818586609711774e-05,
+      "loss": 0.5982,
+      "step": 18569
+    },
+    {
+      "epoch": 0.2685843008144169,
+      "grad_norm": 1.0341336727142334,
+      "learning_rate": 1.8105102082110462e-05,
+      "loss": 0.6008,
+      "step": 18600
+    },
+    {
+      "epoch": 0.2690319413157743,
+      "grad_norm": 1.0030567646026611,
+      "learning_rate": 1.8024415875089058e-05,
+      "loss": 0.6011,
+      "step": 18631
+    },
+    {
+      "epoch": 0.2694795818171316,
+      "grad_norm": 0.9385823607444763,
+      "learning_rate": 1.7943808386586407e-05,
+      "loss": 0.6077,
+      "step": 18662
+    },
+    {
+      "epoch": 0.269927222318489,
+      "grad_norm": 0.8827871680259705,
+      "learning_rate": 1.7863280526247073e-05,
+      "loss": 0.6073,
+      "step": 18693
+    },
+    {
+      "epoch": 0.27037486281984635,
+      "grad_norm": 0.9739916324615479,
+      "learning_rate": 1.7782833202817003e-05,
+      "loss": 0.595,
+      "step": 18724
+    },
+    {
+      "epoch": 0.2708225033212037,
+      "grad_norm": 0.9108980298042297,
+      "learning_rate": 1.7702467324133327e-05,
+      "loss": 0.587,
+      "step": 18755
+    },
+    {
+      "epoch": 0.2712701438225611,
+      "grad_norm": 1.0579863786697388,
+      "learning_rate": 1.7622183797114042e-05,
+      "loss": 0.6043,
+      "step": 18786
+    },
+    {
+      "epoch": 0.27171778432391847,
+      "grad_norm": 0.9881874322891235,
+      "learning_rate": 1.7541983527747838e-05,
+      "loss": 0.5905,
+      "step": 18817
+    },
+    {
+      "epoch": 0.2721654248252758,
+      "grad_norm": 0.9560896158218384,
+      "learning_rate": 1.746186742108387e-05,
+      "loss": 0.6033,
+      "step": 18848
+    },
+    {
+      "epoch": 0.27261306532663315,
+      "grad_norm": 0.9506632685661316,
+      "learning_rate": 1.73818363812215e-05,
+      "loss": 0.5935,
+      "step": 18879
+    },
+    {
+      "epoch": 0.2730607058279905,
+      "grad_norm": 0.9935999512672424,
+      "learning_rate": 1.7301891311300153e-05,
+      "loss": 0.5997,
+      "step": 18910
+    },
+    {
+      "epoch": 0.2735083463293479,
+      "grad_norm": 0.9102685451507568,
+      "learning_rate": 1.7222033113489055e-05,
+      "loss": 0.5982,
+      "step": 18941
+    },
+    {
+      "epoch": 0.27395598683070527,
+      "grad_norm": 1.0436829328536987,
+      "learning_rate": 1.7142262688977127e-05,
+      "loss": 0.603,
+      "step": 18972
+    },
+    {
+      "epoch": 0.27440362733206264,
+      "grad_norm": 1.0441209077835083,
+      "learning_rate": 1.7062580937962764e-05,
+      "loss": 0.5957,
+      "step": 19003
+    },
+    {
+      "epoch": 0.27485126783341995,
+      "grad_norm": 0.9903119206428528,
+      "learning_rate": 1.698298875964369e-05,
+      "loss": 0.5972,
+      "step": 19034
+    },
+    {
+      "epoch": 0.2752989083347773,
+      "grad_norm": 0.8914598226547241,
+      "learning_rate": 1.690348705220684e-05,
+      "loss": 0.6027,
+      "step": 19065
+    },
+    {
+      "epoch": 0.2757465488361347,
+      "grad_norm": 0.9678306579589844,
+      "learning_rate": 1.6824076712818156e-05,
+      "loss": 0.5999,
+      "step": 19096
+    },
+    {
+      "epoch": 0.27619418933749207,
+      "grad_norm": 0.9159491658210754,
+      "learning_rate": 1.6744758637612533e-05,
+      "loss": 0.6029,
+      "step": 19127
+    },
+    {
+      "epoch": 0.27664182983884944,
+      "grad_norm": 1.0948030948638916,
+      "learning_rate": 1.6665533721683664e-05,
+      "loss": 0.604,
+      "step": 19158
+    },
+    {
+      "epoch": 0.27708947034020676,
+      "grad_norm": 0.866001546382904,
+      "learning_rate": 1.6586402859073974e-05,
+      "loss": 0.5863,
+      "step": 19189
+    },
+    {
+      "epoch": 0.2775371108415641,
+      "grad_norm": 1.0278693437576294,
+      "learning_rate": 1.6507366942764463e-05,
+      "loss": 0.5937,
+      "step": 19220
+    },
+    {
+      "epoch": 0.2779847513429215,
+      "grad_norm": 0.9074748158454895,
+      "learning_rate": 1.6428426864664732e-05,
+      "loss": 0.602,
+      "step": 19251
+    },
+    {
+      "epoch": 0.27843239184427887,
+      "grad_norm": 0.9951406717300415,
+      "learning_rate": 1.6349583515602816e-05,
+      "loss": 0.5982,
+      "step": 19282
+    },
+    {
+      "epoch": 0.27888003234563624,
+      "grad_norm": 1.0565474033355713,
+      "learning_rate": 1.6270837785315208e-05,
+      "loss": 0.6008,
+      "step": 19313
+    },
+    {
+      "epoch": 0.2793276728469936,
+      "grad_norm": 0.9266191124916077,
+      "learning_rate": 1.619219056243676e-05,
+      "loss": 0.5994,
+      "step": 19344
+    },
+    {
+      "epoch": 0.27977531334835093,
+      "grad_norm": 0.8990464806556702,
+      "learning_rate": 1.6113642734490698e-05,
+      "loss": 0.5984,
+      "step": 19375
+    },
+    {
+      "epoch": 0.2802229538497083,
+      "grad_norm": 0.9231170415878296,
+      "learning_rate": 1.6035195187878577e-05,
+      "loss": 0.5952,
+      "step": 19406
+    },
+    {
+      "epoch": 0.28067059435106567,
+      "grad_norm": 1.035946011543274,
+      "learning_rate": 1.5956848807870305e-05,
+      "loss": 0.5985,
+      "step": 19437
+    },
+    {
+      "epoch": 0.28111823485242304,
+      "grad_norm": 0.8787546157836914,
+      "learning_rate": 1.587860447859413e-05,
+      "loss": 0.5999,
+      "step": 19468
+    },
+    {
+      "epoch": 0.2815658753537804,
+      "grad_norm": 0.8387063145637512,
+      "learning_rate": 1.5800463083026686e-05,
+      "loss": 0.5973,
+      "step": 19499
+    },
+    {
+      "epoch": 0.28201351585513773,
+      "grad_norm": 1.025985598564148,
+      "learning_rate": 1.572242550298298e-05,
+      "loss": 0.597,
+      "step": 19530
+    },
+    {
+      "epoch": 0.2824611563564951,
+      "grad_norm": 0.9072343707084656,
+      "learning_rate": 1.56444926191065e-05,
+      "loss": 0.5868,
+      "step": 19561
+    },
+    {
+      "epoch": 0.2829087968578525,
+      "grad_norm": 0.9914515614509583,
+      "learning_rate": 1.5566665310859257e-05,
+      "loss": 0.5926,
+      "step": 19592
+    },
+    {
+      "epoch": 0.28335643735920985,
+      "grad_norm": 0.9568142294883728,
+      "learning_rate": 1.5488944456511846e-05,
+      "loss": 0.6023,
+      "step": 19623
+    },
+    {
+      "epoch": 0.2838040778605672,
+      "grad_norm": 0.8508808016777039,
+      "learning_rate": 1.5411330933133546e-05,
+      "loss": 0.5991,
+      "step": 19654
+    },
+    {
+      "epoch": 0.2842517183619246,
+      "grad_norm": 0.9583558440208435,
+      "learning_rate": 1.533382561658241e-05,
+      "loss": 0.6031,
+      "step": 19685
+    },
+    {
+      "epoch": 0.2846993588632819,
+      "grad_norm": 0.9079626798629761,
+      "learning_rate": 1.525642938149541e-05,
+      "loss": 0.6021,
+      "step": 19716
+    },
+    {
+      "epoch": 0.2851469993646393,
+      "grad_norm": 0.8839224576950073,
+      "learning_rate": 1.5179143101278536e-05,
+      "loss": 0.5974,
+      "step": 19747
+    },
+    {
+      "epoch": 0.28559463986599665,
+      "grad_norm": 0.9244747161865234,
+      "learning_rate": 1.5101967648096955e-05,
+      "loss": 0.5951,
+      "step": 19778
+    },
+    {
+      "epoch": 0.286042280367354,
+      "grad_norm": 0.937430202960968,
+      "learning_rate": 1.5024903892865172e-05,
+      "loss": 0.5981,
+      "step": 19809
+    },
+    {
+      "epoch": 0.2864899208687114,
+      "grad_norm": 1.0390359163284302,
+      "learning_rate": 1.4947952705237184e-05,
+      "loss": 0.6017,
+      "step": 19840
+    },
+    {
+      "epoch": 0.28693756137006876,
+      "grad_norm": 0.9726883172988892,
+      "learning_rate": 1.4871114953596682e-05,
+      "loss": 0.5956,
+      "step": 19871
+    },
+    {
+      "epoch": 0.2873852018714261,
+      "grad_norm": 0.8611225485801697,
+      "learning_rate": 1.4794391505047256e-05,
+      "loss": 0.5875,
+      "step": 19902
+    },
+    {
+      "epoch": 0.28783284237278345,
+      "grad_norm": 0.9599292278289795,
+      "learning_rate": 1.4717783225402596e-05,
+      "loss": 0.5948,
+      "step": 19933
+    },
+    {
+      "epoch": 0.2882804828741408,
+      "grad_norm": 0.9473167061805725,
+      "learning_rate": 1.4641290979176735e-05,
+      "loss": 0.5967,
+      "step": 19964
+    },
+    {
+      "epoch": 0.2887281233754982,
+      "grad_norm": 0.9631912708282471,
+      "learning_rate": 1.4564915629574246e-05,
+      "loss": 0.5962,
+      "step": 19995
+    },
+    {
+      "epoch": 0.28917576387685556,
+      "grad_norm": 0.9674975872039795,
+      "learning_rate": 1.4488658038480601e-05,
+      "loss": 0.59,
+      "step": 20026
+    },
+    {
+      "epoch": 0.2896234043782129,
+      "grad_norm": 1.1209561824798584,
+      "learning_rate": 1.4412519066452323e-05,
+      "loss": 0.6032,
+      "step": 20057
+    },
+    {
+      "epoch": 0.29007104487957025,
+      "grad_norm": 0.9360538125038147,
+      "learning_rate": 1.4336499572707373e-05,
+      "loss": 0.5975,
+      "step": 20088
+    },
+    {
+      "epoch": 0.2905186853809276,
+      "grad_norm": 0.9791879653930664,
+      "learning_rate": 1.4260600415115433e-05,
+      "loss": 0.6051,
+      "step": 20119
+    },
+    {
+      "epoch": 0.290966325882285,
+      "grad_norm": 1.0199767351150513,
+      "learning_rate": 1.4184822450188137e-05,
+      "loss": 0.5912,
+      "step": 20150
+    },
+    {
+      "epoch": 0.29141396638364236,
+      "grad_norm": 0.8803568482398987,
+      "learning_rate": 1.410916653306954e-05,
+      "loss": 0.6007,
+      "step": 20181
+    },
+    {
+      "epoch": 0.29186160688499974,
+      "grad_norm": 0.9544051289558411,
+      "learning_rate": 1.403363351752639e-05,
+      "loss": 0.5901,
+      "step": 20212
+    },
+    {
+      "epoch": 0.29230924738635705,
+      "grad_norm": 1.0661756992340088,
+      "learning_rate": 1.3958224255938485e-05,
+      "loss": 0.595,
+      "step": 20243
+    },
+    {
+      "epoch": 0.2927568878877144,
+      "grad_norm": 0.9343761801719666,
+      "learning_rate": 1.388293959928911e-05,
+      "loss": 0.6016,
+      "step": 20274
+    },
+    {
+      "epoch": 0.2932045283890718,
+      "grad_norm": 1.0200270414352417,
+      "learning_rate": 1.3807780397155379e-05,
+      "loss": 0.6,
+      "step": 20305
+    },
+    {
+      "epoch": 0.29365216889042917,
+      "grad_norm": 0.8452933430671692,
+      "learning_rate": 1.3732747497698655e-05,
+      "loss": 0.5978,
+      "step": 20336
+    },
+    {
+      "epoch": 0.29409980939178654,
+      "grad_norm": 1.000546932220459,
+      "learning_rate": 1.3657841747655038e-05,
+      "loss": 0.5933,
+      "step": 20367
+    },
+    {
+      "epoch": 0.29454744989314385,
+      "grad_norm": 0.9047265648841858,
+      "learning_rate": 1.3583063992325706e-05,
+      "loss": 0.5954,
+      "step": 20398
+    },
+    {
+      "epoch": 0.2949950903945012,
+      "grad_norm": 0.877160906791687,
+      "learning_rate": 1.3508415075567496e-05,
+      "loss": 0.5921,
+      "step": 20429
+    },
+    {
+      "epoch": 0.2954427308958586,
+      "grad_norm": 1.0855872631072998,
+      "learning_rate": 1.343389583978327e-05,
+      "loss": 0.5992,
+      "step": 20460
+    },
+    {
+      "epoch": 0.29589037139721597,
+      "grad_norm": 1.006057620048523,
+      "learning_rate": 1.3359507125912468e-05,
+      "loss": 0.5916,
+      "step": 20491
+    },
+    {
+      "epoch": 0.29633801189857334,
+      "grad_norm": 1.003037929534912,
+      "learning_rate": 1.3285249773421627e-05,
+      "loss": 0.5918,
+      "step": 20522
+    },
+    {
+      "epoch": 0.2967856523999307,
+      "grad_norm": 0.9983749389648438,
+      "learning_rate": 1.3211124620294884e-05,
+      "loss": 0.5922,
+      "step": 20553
+    },
+    {
+      "epoch": 0.297233292901288,
+      "grad_norm": 1.0387030839920044,
+      "learning_rate": 1.313713250302451e-05,
+      "loss": 0.5991,
+      "step": 20584
+    },
+    {
+      "epoch": 0.2976809334026454,
+      "grad_norm": 0.8586576581001282,
+      "learning_rate": 1.3063274256601479e-05,
+      "loss": 0.6001,
+      "step": 20615
+    },
+    {
+      "epoch": 0.29812857390400277,
+      "grad_norm": 0.9596696496009827,
+      "learning_rate": 1.2989550714506086e-05,
+      "loss": 0.5988,
+      "step": 20646
+    },
+    {
+      "epoch": 0.29857621440536014,
+      "grad_norm": 0.9584054350852966,
+      "learning_rate": 1.291596270869846e-05,
+      "loss": 0.5946,
+      "step": 20677
+    },
+    {
+      "epoch": 0.2990238549067175,
+      "grad_norm": 1.02814519405365,
+      "learning_rate": 1.284251106960927e-05,
+      "loss": 0.5941,
+      "step": 20708
+    },
+    {
+      "epoch": 0.2994714954080749,
+      "grad_norm": 1.1163685321807861,
+      "learning_rate": 1.2769196626130263e-05,
+      "loss": 0.6002,
+      "step": 20739
+    },
+    {
+      "epoch": 0.2999191359094322,
+      "grad_norm": 0.9234864711761475,
+      "learning_rate": 1.2696020205604969e-05,
+      "loss": 0.5919,
+      "step": 20770
+    },
+    {
+      "epoch": 0.30036677641078957,
+      "grad_norm": 0.9402379393577576,
+      "learning_rate": 1.2622982633819359e-05,
+      "loss": 0.5931,
+      "step": 20801
+    },
+    {
+      "epoch": 0.30081441691214694,
+      "grad_norm": 0.9681121110916138,
+      "learning_rate": 1.2550084734992484e-05,
+      "loss": 0.5904,
+      "step": 20832
+    },
+    {
+      "epoch": 0.3012620574135043,
+      "grad_norm": 0.9508892893791199,
+      "learning_rate": 1.247732733176724e-05,
+      "loss": 0.5997,
+      "step": 20863
+    },
+    {
+      "epoch": 0.3017096979148617,
+      "grad_norm": 0.8664924502372742,
+      "learning_rate": 1.2404711245201044e-05,
+      "loss": 0.594,
+      "step": 20894
+    },
+    {
+      "epoch": 0.302157338416219,
+      "grad_norm": 0.8919743299484253,
+      "learning_rate": 1.2332237294756535e-05,
+      "loss": 0.5873,
+      "step": 20925
+    },
+    {
+      "epoch": 0.3026049789175764,
+      "grad_norm": 0.9090976119041443,
+      "learning_rate": 1.225990629829241e-05,
+      "loss": 0.5966,
+      "step": 20956
+    },
+    {
+      "epoch": 0.30305261941893374,
+      "grad_norm": 0.8878434896469116,
+      "learning_rate": 1.2187719072054136e-05,
+      "loss": 0.5939,
+      "step": 20987
+    },
+    {
+      "epoch": 0.3035002599202911,
+      "grad_norm": 0.8897850513458252,
+      "learning_rate": 1.2115676430664735e-05,
+      "loss": 0.5978,
+      "step": 21018
+    },
+    {
+      "epoch": 0.3039479004216485,
+      "grad_norm": 0.8866651654243469,
+      "learning_rate": 1.2043779187115647e-05,
+      "loss": 0.5873,
+      "step": 21049
+    },
+    {
+      "epoch": 0.30439554092300586,
+      "grad_norm": 0.8519348502159119,
+      "learning_rate": 1.1972028152757476e-05,
+      "loss": 0.5991,
+      "step": 21080
+    },
+    {
+      "epoch": 0.3048431814243632,
+      "grad_norm": 1.146201252937317,
+      "learning_rate": 1.1900424137290889e-05,
+      "loss": 0.5928,
+      "step": 21111
+    },
+    {
+      "epoch": 0.30529082192572055,
+      "grad_norm": 1.0777043104171753,
+      "learning_rate": 1.1828967948757482e-05,
+      "loss": 0.5966,
+      "step": 21142
+    },
+    {
+      "epoch": 0.3057384624270779,
+      "grad_norm": 1.0404378175735474,
+      "learning_rate": 1.175766039353062e-05,
+      "loss": 0.607,
+      "step": 21173
+    },
+    {
+      "epoch": 0.3061861029284353,
+      "grad_norm": 0.8684154152870178,
+      "learning_rate": 1.1686502276306382e-05,
+      "loss": 0.5992,
+      "step": 21204
+    },
+    {
+      "epoch": 0.30663374342979266,
+      "grad_norm": 0.9449039101600647,
+      "learning_rate": 1.1615494400094445e-05,
+      "loss": 0.5937,
+      "step": 21235
+    },
+    {
+      "epoch": 0.30708138393115,
+      "grad_norm": 0.9459973573684692,
+      "learning_rate": 1.1544637566209029e-05,
+      "loss": 0.5953,
+      "step": 21266
+    },
+    {
+      "epoch": 0.30752902443250735,
+      "grad_norm": 0.8467513918876648,
+      "learning_rate": 1.1473932574259886e-05,
+      "loss": 0.5937,
+      "step": 21297
+    },
+    {
+      "epoch": 0.3079766649338647,
+      "grad_norm": 0.953157901763916,
+      "learning_rate": 1.1403380222143247e-05,
+      "loss": 0.592,
+      "step": 21328
+    },
+    {
+      "epoch": 0.3084243054352221,
+      "grad_norm": 0.9762019515037537,
+      "learning_rate": 1.1332981306032808e-05,
+      "loss": 0.6009,
+      "step": 21359
+    },
+    {
+      "epoch": 0.30887194593657946,
+      "grad_norm": 0.7931903600692749,
+      "learning_rate": 1.1262736620370762e-05,
+      "loss": 0.5951,
+      "step": 21390
+    },
+    {
+      "epoch": 0.30931958643793683,
+      "grad_norm": 1.042128324508667,
+      "learning_rate": 1.1192646957858854e-05,
+      "loss": 0.5951,
+      "step": 21421
+    },
+    {
+      "epoch": 0.30976722693929415,
+      "grad_norm": 0.9942502379417419,
+      "learning_rate": 1.1122713109449381e-05,
+      "loss": 0.5945,
+      "step": 21452
+    },
+    {
+      "epoch": 0.3102148674406515,
+      "grad_norm": 0.9771155118942261,
+      "learning_rate": 1.105293586433634e-05,
+      "loss": 0.5929,
+      "step": 21483
+    },
+    {
+      "epoch": 0.3106625079420089,
+      "grad_norm": 0.9390444159507751,
+      "learning_rate": 1.0983316009946446e-05,
+      "loss": 0.5944,
+      "step": 21514
+    },
+    {
+      "epoch": 0.31111014844336626,
+      "grad_norm": 0.9289217591285706,
+      "learning_rate": 1.0913854331930282e-05,
+      "loss": 0.5902,
+      "step": 21545
+    },
+    {
+      "epoch": 0.31155778894472363,
+      "grad_norm": 0.8977670669555664,
+      "learning_rate": 1.0844551614153456e-05,
+      "loss": 0.5951,
+      "step": 21576
+    },
+    {
+      "epoch": 0.312005429446081,
+      "grad_norm": 0.9898940920829773,
+      "learning_rate": 1.0775408638687725e-05,
+      "loss": 0.5997,
+      "step": 21607
+    },
+    {
+      "epoch": 0.3124530699474383,
+      "grad_norm": 0.9756447076797485,
+      "learning_rate": 1.0706426185802165e-05,
+      "loss": 0.5969,
+      "step": 21638
+    },
+    {
+      "epoch": 0.3129007104487957,
+      "grad_norm": 1.0475540161132812,
+      "learning_rate": 1.0637605033954371e-05,
+      "loss": 0.5908,
+      "step": 21669
+    },
+    {
+      "epoch": 0.31334835095015307,
+      "grad_norm": 0.9765790700912476,
+      "learning_rate": 1.05689459597817e-05,
+      "loss": 0.5903,
+      "step": 21700
+    },
+    {
+      "epoch": 0.31379599145151044,
+      "grad_norm": 0.8677023649215698,
+      "learning_rate": 1.050044973809246e-05,
+      "loss": 0.5907,
+      "step": 21731
+    },
+    {
+      "epoch": 0.3142436319528678,
+      "grad_norm": 0.937731921672821,
+      "learning_rate": 1.043211714185722e-05,
+      "loss": 0.603,
+      "step": 21762
+    },
+    {
+      "epoch": 0.3146912724542251,
+      "grad_norm": 0.8233932256698608,
+      "learning_rate": 1.036394894220003e-05,
+      "loss": 0.5955,
+      "step": 21793
+    },
+    {
+      "epoch": 0.3151389129555825,
+      "grad_norm": 1.1260769367218018,
+      "learning_rate": 1.0295945908389751e-05,
+      "loss": 0.605,
+      "step": 21824
+    },
+    {
+      "epoch": 0.31558655345693987,
+      "grad_norm": 0.9366801977157593,
+      "learning_rate": 1.0228108807831393e-05,
+      "loss": 0.5963,
+      "step": 21855
+    },
+    {
+      "epoch": 0.31603419395829724,
+      "grad_norm": 0.871155321598053,
+      "learning_rate": 1.01604384060574e-05,
+      "loss": 0.5876,
+      "step": 21886
+    },
+    {
+      "epoch": 0.3164818344596546,
+      "grad_norm": 0.9532550573348999,
+      "learning_rate": 1.009293546671907e-05,
+      "loss": 0.5863,
+      "step": 21917
+    },
+    {
+      "epoch": 0.316929474961012,
+      "grad_norm": 1.045569658279419,
+      "learning_rate": 1.002560075157791e-05,
+      "loss": 0.5899,
+      "step": 21948
+    },
+    {
+      "epoch": 0.3173771154623693,
+      "grad_norm": 0.9291980862617493,
+      "learning_rate": 9.958435020496995e-06,
+      "loss": 0.5904,
+      "step": 21979
+    },
+    {
+      "epoch": 0.31782475596372667,
+      "grad_norm": 0.8881365060806274,
+      "learning_rate": 9.89143903143249e-06,
+      "loss": 0.5997,
+      "step": 22010
+    },
+    {
+      "epoch": 0.31827239646508404,
+      "grad_norm": 0.9601870179176331,
+      "learning_rate": 9.824613540425038e-06,
+      "loss": 0.5965,
+      "step": 22041
+    },
+    {
+      "epoch": 0.3187200369664414,
+      "grad_norm": 0.8519198298454285,
+      "learning_rate": 9.757959301591197e-06,
+      "loss": 0.5887,
+      "step": 22072
+    },
+    {
+      "epoch": 0.3191676774677988,
+      "grad_norm": 0.9262305498123169,
+      "learning_rate": 9.691477067115017e-06,
+      "loss": 0.5846,
+      "step": 22103
+    },
+    {
+      "epoch": 0.3196153179691561,
+      "grad_norm": 1.0259448289871216,
+      "learning_rate": 9.625167587239467e-06,
+      "loss": 0.5865,
+      "step": 22134
+    },
+    {
+      "epoch": 0.32006295847051347,
+      "grad_norm": 0.9057780504226685,
+      "learning_rate": 9.559031610258007e-06,
+      "loss": 0.592,
+      "step": 22165
+    },
+    {
+      "epoch": 0.32051059897187084,
+      "grad_norm": 0.905604362487793,
+      "learning_rate": 9.493069882506164e-06,
+      "loss": 0.5958,
+      "step": 22196
+    },
+    {
+      "epoch": 0.3209582394732282,
+      "grad_norm": 0.8837811946868896,
+      "learning_rate": 9.427283148353056e-06,
+      "loss": 0.5955,
+      "step": 22227
+    },
+    {
+      "epoch": 0.3214058799745856,
+      "grad_norm": 0.9125133752822876,
+      "learning_rate": 9.361672150193052e-06,
+      "loss": 0.5915,
+      "step": 22258
+    },
+    {
+      "epoch": 0.32185352047594296,
+      "grad_norm": 0.8553541898727417,
+      "learning_rate": 9.29623762843734e-06,
+      "loss": 0.586,
+      "step": 22289
+    },
+    {
+      "epoch": 0.32230116097730027,
+      "grad_norm": 0.8609781861305237,
+      "learning_rate": 9.230980321505594e-06,
+      "loss": 0.5867,
+      "step": 22320
+    },
+    {
+      "epoch": 0.32274880147865764,
+      "grad_norm": 0.8896780014038086,
+      "learning_rate": 9.165900965817668e-06,
+      "loss": 0.5862,
+      "step": 22351
+    },
+    {
+      "epoch": 0.323196441980015,
+      "grad_norm": 1.0318437814712524,
+      "learning_rate": 9.101000295785245e-06,
+      "loss": 0.5906,
+      "step": 22382
+    },
+    {
+      "epoch": 0.3236440824813724,
+      "grad_norm": 1.0346667766571045,
+      "learning_rate": 9.036279043803565e-06,
+      "loss": 0.594,
+      "step": 22413
+    },
+    {
+      "epoch": 0.32409172298272976,
+      "grad_norm": 0.899023175239563,
+      "learning_rate": 8.971737940243147e-06,
+      "loss": 0.5983,
+      "step": 22444
+    },
+    {
+      "epoch": 0.32453936348408713,
+      "grad_norm": 0.8427733182907104,
+      "learning_rate": 8.907377713441592e-06,
+      "loss": 0.5928,
+      "step": 22475
+    },
+    {
+      "epoch": 0.32498700398544444,
+      "grad_norm": 0.8469851613044739,
+      "learning_rate": 8.843199089695293e-06,
+      "loss": 0.5867,
+      "step": 22506
+    },
+    {
+      "epoch": 0.3254346444868018,
+      "grad_norm": 0.8703016638755798,
+      "learning_rate": 8.779202793251311e-06,
+      "loss": 0.5894,
+      "step": 22537
+    },
+    {
+      "epoch": 0.3258822849881592,
+      "grad_norm": 0.9438649415969849,
+      "learning_rate": 8.715389546299149e-06,
+      "loss": 0.5949,
+      "step": 22568
+    },
+    {
+      "epoch": 0.32632992548951656,
+      "grad_norm": 0.8361387848854065,
+      "learning_rate": 8.651760068962617e-06,
+      "loss": 0.5956,
+      "step": 22599
+    },
+    {
+      "epoch": 0.32677756599087393,
+      "grad_norm": 0.8810434341430664,
+      "learning_rate": 8.588315079291733e-06,
+      "loss": 0.5904,
+      "step": 22630
+    },
+    {
+      "epoch": 0.32722520649223125,
+      "grad_norm": 0.9140039682388306,
+      "learning_rate": 8.52505529325457e-06,
+      "loss": 0.5871,
+      "step": 22661
+    },
+    {
+      "epoch": 0.3276728469935886,
+      "grad_norm": 0.8848084211349487,
+      "learning_rate": 8.461981424729216e-06,
+      "loss": 0.5973,
+      "step": 22692
+    },
+    {
+      "epoch": 0.328120487494946,
+      "grad_norm": 0.8551177382469177,
+      "learning_rate": 8.399094185495725e-06,
+      "loss": 0.5925,
+      "step": 22723
+    },
+    {
+      "epoch": 0.32856812799630336,
+      "grad_norm": 0.9873132705688477,
+      "learning_rate": 8.336394285228017e-06,
+      "loss": 0.592,
+      "step": 22754
+    },
+    {
+      "epoch": 0.32901576849766073,
+      "grad_norm": 0.9582761526107788,
+      "learning_rate": 8.273882431485952e-06,
+      "loss": 0.5957,
+      "step": 22785
+    },
+    {
+      "epoch": 0.3294634089990181,
+      "grad_norm": 0.9337429404258728,
+      "learning_rate": 8.211559329707316e-06,
+      "loss": 0.5893,
+      "step": 22816
+    },
+    {
+      "epoch": 0.3299110495003754,
+      "grad_norm": 0.8926681280136108,
+      "learning_rate": 8.149425683199823e-06,
+      "loss": 0.593,
+      "step": 22847
+    },
+    {
+      "epoch": 0.3303586900017328,
+      "grad_norm": 0.8568328022956848,
+      "learning_rate": 8.08748219313325e-06,
+      "loss": 0.5895,
+      "step": 22878
+    }
+  ],
+  "logging_steps": 31,
+  "max_steps": 30517,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 7630,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.6887420591349432e+19,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-22890/training_args.bin b/checkpoint-22890/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542
--- /dev/null
+++ b/checkpoint-22890/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3
+size 5432
diff --git a/checkpoint-30517/config.json b/checkpoint-30517/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09
--- /dev/null
+++ b/checkpoint-30517/config.json
@@ -0,0 +1,36 @@
+{
+  "_name_or_path": "meta-llama/Llama-3.1-8B",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128001,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0.dev0",
+  "use_cache": true,
+  "vocab_size": 128256
+}
diff --git a/checkpoint-30517/generation_config.json b/checkpoint-30517/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507
--- /dev/null
+++ b/checkpoint-30517/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": 128001,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.47.0.dev0"
+}
diff --git a/checkpoint-30517/model-00001-of-00007.safetensors b/checkpoint-30517/model-00001-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..542f9d7381f168eb0b0a63a75a9adf93a5deee06
--- /dev/null
+++ b/checkpoint-30517/model-00001-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13878f97ef55d85d9b352c717dba406c909afe1bae3e88a6a4424a428c0bccc6
+size 4886466168
diff --git a/checkpoint-30517/model-00002-of-00007.safetensors b/checkpoint-30517/model-00002-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961
--- /dev/null
+++ b/checkpoint-30517/model-00002-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64
+size 4832007448
diff --git a/checkpoint-30517/model-00003-of-00007.safetensors b/checkpoint-30517/model-00003-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff
--- /dev/null
+++ b/checkpoint-30517/model-00003-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97
+size 4999813112
diff --git a/checkpoint-30517/model-00004-of-00007.safetensors b/checkpoint-30517/model-00004-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a
--- /dev/null
+++ b/checkpoint-30517/model-00004-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042
+size 4999813128
diff --git a/checkpoint-30517/model-00005-of-00007.safetensors b/checkpoint-30517/model-00005-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89
--- /dev/null
+++ b/checkpoint-30517/model-00005-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7
+size 4832007496
diff --git a/checkpoint-30517/model-00006-of-00007.safetensors b/checkpoint-30517/model-00006-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..524e69fde1febd3510b4725ea7cfc5103d79e8d1
--- /dev/null
+++ b/checkpoint-30517/model-00006-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edf01cd5fe74bfd002b701e409ad8e68a02b64c59f6cfb2302b9427953c464f3
+size 4999813120
diff --git a/checkpoint-30517/model-00007-of-00007.safetensors b/checkpoint-30517/model-00007-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b73e0db4c4b0097276bbc78a364a5ff57002d272
--- /dev/null
+++ b/checkpoint-30517/model-00007-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9dd6df4fc7f009aa39a3b019a55cb97a8509bb44690419b3255311faaf9e89b
+size 2571158184
diff --git a/checkpoint-30517/model.safetensors.index.json b/checkpoint-30517/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13
--- /dev/null
+++ b/checkpoint-30517/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 32121044992
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00007-of-00007.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.norm.weight": "model-00007-of-00007.safetensors"
+  }
+}
diff --git a/checkpoint-30517/optimizer.pt b/checkpoint-30517/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..82e8d1479868d74224afb62a58ff9a716f173ee2
--- /dev/null
+++ b/checkpoint-30517/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78882a3bd9b13b3cc04e81dbcb7a5549988fdbd8420fa4d16b5bf8114af0a37e
+size 15385036334
diff --git a/checkpoint-30517/rng_state.pth b/checkpoint-30517/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93a52228ea16468eaab41b66555ced3eaaea002c
--- /dev/null
+++ b/checkpoint-30517/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
+size 14244
diff --git a/checkpoint-30517/scheduler.pt b/checkpoint-30517/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1d585b62f4a9bc8119a4f1d03cf2bb269e99411b
--- /dev/null
+++ b/checkpoint-30517/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d91c597306f26392e8da0d33a09fdcca77cfdeb5ad42248045521772fa3d64e
+size 1064
diff --git a/checkpoint-30517/trainer_state.json b/checkpoint-30517/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..70ec5cf1ca59042ec3c40d0130e41247e284e3a1
--- /dev/null
+++ b/checkpoint-30517/trainer_state.json
@@ -0,0 +1,6921 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.4406659735458904,
+  "eval_steps": 500,
+  "global_step": 30517,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0004476405013573615,
+      "grad_norm": 4.6696085929870605,
+      "learning_rate": 1.0157273918741808e-06,
+      "loss": 0.9366,
+      "step": 31
+    },
+    {
+      "epoch": 0.000895281002714723,
+      "grad_norm": 4.250915050506592,
+      "learning_rate": 2.0314547837483616e-06,
+      "loss": 0.9002,
+      "step": 62
+    },
+    {
+      "epoch": 0.0013429215040720846,
+      "grad_norm": 4.424270153045654,
+      "learning_rate": 3.0471821756225426e-06,
+      "loss": 0.8843,
+      "step": 93
+    },
+    {
+      "epoch": 0.001790562005429446,
+      "grad_norm": 4.56964635848999,
+      "learning_rate": 4.062909567496723e-06,
+      "loss": 0.8717,
+      "step": 124
+    },
+    {
+      "epoch": 0.0022382025067868077,
+      "grad_norm": 4.051624298095703,
+      "learning_rate": 5.078636959370905e-06,
+      "loss": 0.8711,
+      "step": 155
+    },
+    {
+      "epoch": 0.002685843008144169,
+      "grad_norm": 3.98006272315979,
+      "learning_rate": 6.094364351245085e-06,
+      "loss": 0.8628,
+      "step": 186
+    },
+    {
+      "epoch": 0.0031334835095015307,
+      "grad_norm": 4.4158406257629395,
+      "learning_rate": 7.110091743119267e-06,
+      "loss": 0.871,
+      "step": 217
+    },
+    {
+      "epoch": 0.003581124010858892,
+      "grad_norm": 4.681333541870117,
+      "learning_rate": 8.125819134993446e-06,
+      "loss": 0.8593,
+      "step": 248
+    },
+    {
+      "epoch": 0.004028764512216254,
+      "grad_norm": 3.8057820796966553,
+      "learning_rate": 9.141546526867629e-06,
+      "loss": 0.8558,
+      "step": 279
+    },
+    {
+      "epoch": 0.0044764050135736155,
+      "grad_norm": 4.523633003234863,
+      "learning_rate": 1.015727391874181e-05,
+      "loss": 0.8676,
+      "step": 310
+    },
+    {
+      "epoch": 0.0049240455149309765,
+      "grad_norm": 3.7387187480926514,
+      "learning_rate": 1.117300131061599e-05,
+      "loss": 0.8585,
+      "step": 341
+    },
+    {
+      "epoch": 0.005371686016288338,
+      "grad_norm": 4.187750816345215,
+      "learning_rate": 1.218872870249017e-05,
+      "loss": 0.8592,
+      "step": 372
+    },
+    {
+      "epoch": 0.005819326517645699,
+      "grad_norm": 3.782883644104004,
+      "learning_rate": 1.3204456094364351e-05,
+      "loss": 0.8449,
+      "step": 403
+    },
+    {
+      "epoch": 0.006266967019003061,
+      "grad_norm": 3.577796459197998,
+      "learning_rate": 1.4220183486238533e-05,
+      "loss": 0.8418,
+      "step": 434
+    },
+    {
+      "epoch": 0.006714607520360423,
+      "grad_norm": 3.1408321857452393,
+      "learning_rate": 1.5235910878112714e-05,
+      "loss": 0.8577,
+      "step": 465
+    },
+    {
+      "epoch": 0.007162248021717784,
+      "grad_norm": 4.090081691741943,
+      "learning_rate": 1.6251638269986893e-05,
+      "loss": 0.8439,
+      "step": 496
+    },
+    {
+      "epoch": 0.007609888523075146,
+      "grad_norm": 2.7458200454711914,
+      "learning_rate": 1.7267365661861077e-05,
+      "loss": 0.8468,
+      "step": 527
+    },
+    {
+      "epoch": 0.008057529024432507,
+      "grad_norm": 3.703225612640381,
+      "learning_rate": 1.8283093053735257e-05,
+      "loss": 0.8385,
+      "step": 558
+    },
+    {
+      "epoch": 0.008505169525789868,
+      "grad_norm": 3.134650230407715,
+      "learning_rate": 1.9298820445609438e-05,
+      "loss": 0.8418,
+      "step": 589
+    },
+    {
+      "epoch": 0.008952810027147231,
+      "grad_norm": 3.762680768966675,
+      "learning_rate": 2.031454783748362e-05,
+      "loss": 0.8312,
+      "step": 620
+    },
+    {
+      "epoch": 0.009400450528504592,
+      "grad_norm": 3.751004457473755,
+      "learning_rate": 2.13302752293578e-05,
+      "loss": 0.8251,
+      "step": 651
+    },
+    {
+      "epoch": 0.009848091029861953,
+      "grad_norm": 3.2268712520599365,
+      "learning_rate": 2.234600262123198e-05,
+      "loss": 0.8369,
+      "step": 682
+    },
+    {
+      "epoch": 0.010295731531219316,
+      "grad_norm": 3.5854289531707764,
+      "learning_rate": 2.336173001310616e-05,
+      "loss": 0.826,
+      "step": 713
+    },
+    {
+      "epoch": 0.010743372032576677,
+      "grad_norm": 3.9910435676574707,
+      "learning_rate": 2.437745740498034e-05,
+      "loss": 0.8168,
+      "step": 744
+    },
+    {
+      "epoch": 0.011191012533934038,
+      "grad_norm": 3.3059303760528564,
+      "learning_rate": 2.5393184796854525e-05,
+      "loss": 0.8269,
+      "step": 775
+    },
+    {
+      "epoch": 0.011638653035291399,
+      "grad_norm": 3.4081811904907227,
+      "learning_rate": 2.6408912188728702e-05,
+      "loss": 0.817,
+      "step": 806
+    },
+    {
+      "epoch": 0.012086293536648762,
+      "grad_norm": 3.2740163803100586,
+      "learning_rate": 2.7424639580602886e-05,
+      "loss": 0.8195,
+      "step": 837
+    },
+    {
+      "epoch": 0.012533934038006123,
+      "grad_norm": 2.7206223011016846,
+      "learning_rate": 2.8440366972477066e-05,
+      "loss": 0.8188,
+      "step": 868
+    },
+    {
+      "epoch": 0.012981574539363484,
+      "grad_norm": 2.7005629539489746,
+      "learning_rate": 2.9456094364351244e-05,
+      "loss": 0.8127,
+      "step": 899
+    },
+    {
+      "epoch": 0.013429215040720846,
+      "grad_norm": 2.970745325088501,
+      "learning_rate": 3.0471821756225428e-05,
+      "loss": 0.8126,
+      "step": 930
+    },
+    {
+      "epoch": 0.013876855542078207,
+      "grad_norm": 2.4761953353881836,
+      "learning_rate": 3.148754914809961e-05,
+      "loss": 0.82,
+      "step": 961
+    },
+    {
+      "epoch": 0.014324496043435568,
+      "grad_norm": 2.8555397987365723,
+      "learning_rate": 3.2503276539973785e-05,
+      "loss": 0.8166,
+      "step": 992
+    },
+    {
+      "epoch": 0.01477213654479293,
+      "grad_norm": 2.8124194145202637,
+      "learning_rate": 3.351900393184797e-05,
+      "loss": 0.8057,
+      "step": 1023
+    },
+    {
+      "epoch": 0.015219777046150292,
+      "grad_norm": 2.353851556777954,
+      "learning_rate": 3.453473132372215e-05,
+      "loss": 0.8064,
+      "step": 1054
+    },
+    {
+      "epoch": 0.015667417547507653,
+      "grad_norm": 3.0127620697021484,
+      "learning_rate": 3.555045871559633e-05,
+      "loss": 0.8086,
+      "step": 1085
+    },
+    {
+      "epoch": 0.016115058048865014,
+      "grad_norm": 2.792686939239502,
+      "learning_rate": 3.6566186107470514e-05,
+      "loss": 0.8152,
+      "step": 1116
+    },
+    {
+      "epoch": 0.016562698550222375,
+      "grad_norm": 2.407134532928467,
+      "learning_rate": 3.7581913499344695e-05,
+      "loss": 0.7949,
+      "step": 1147
+    },
+    {
+      "epoch": 0.017010339051579736,
+      "grad_norm": 2.6921393871307373,
+      "learning_rate": 3.8597640891218876e-05,
+      "loss": 0.804,
+      "step": 1178
+    },
+    {
+      "epoch": 0.0174579795529371,
+      "grad_norm": 2.3015975952148438,
+      "learning_rate": 3.9613368283093056e-05,
+      "loss": 0.7944,
+      "step": 1209
+    },
+    {
+      "epoch": 0.017905620054294462,
+      "grad_norm": 2.8116579055786133,
+      "learning_rate": 4.062909567496724e-05,
+      "loss": 0.7977,
+      "step": 1240
+    },
+    {
+      "epoch": 0.018353260555651823,
+      "grad_norm": 2.5720036029815674,
+      "learning_rate": 4.164482306684142e-05,
+      "loss": 0.7854,
+      "step": 1271
+    },
+    {
+      "epoch": 0.018800901057009184,
+      "grad_norm": 2.0802650451660156,
+      "learning_rate": 4.26605504587156e-05,
+      "loss": 0.7892,
+      "step": 1302
+    },
+    {
+      "epoch": 0.019248541558366545,
+      "grad_norm": 2.4343624114990234,
+      "learning_rate": 4.367627785058978e-05,
+      "loss": 0.7897,
+      "step": 1333
+    },
+    {
+      "epoch": 0.019696182059723906,
+      "grad_norm": 2.509686231613159,
+      "learning_rate": 4.469200524246396e-05,
+      "loss": 0.7855,
+      "step": 1364
+    },
+    {
+      "epoch": 0.020143822561081267,
+      "grad_norm": 2.626512289047241,
+      "learning_rate": 4.570773263433814e-05,
+      "loss": 0.7873,
+      "step": 1395
+    },
+    {
+      "epoch": 0.02059146306243863,
+      "grad_norm": 2.8619399070739746,
+      "learning_rate": 4.672346002621232e-05,
+      "loss": 0.7891,
+      "step": 1426
+    },
+    {
+      "epoch": 0.021039103563795993,
+      "grad_norm": 2.724792718887329,
+      "learning_rate": 4.77391874180865e-05,
+      "loss": 0.782,
+      "step": 1457
+    },
+    {
+      "epoch": 0.021486744065153354,
+      "grad_norm": 2.6659562587738037,
+      "learning_rate": 4.875491480996068e-05,
+      "loss": 0.7856,
+      "step": 1488
+    },
+    {
+      "epoch": 0.021934384566510715,
+      "grad_norm": 2.646078586578369,
+      "learning_rate": 4.977064220183487e-05,
+      "loss": 0.7748,
+      "step": 1519
+    },
+    {
+      "epoch": 0.022382025067868076,
+      "grad_norm": 2.429288387298584,
+      "learning_rate": 4.9999915451558777e-05,
+      "loss": 0.7722,
+      "step": 1550
+    },
+    {
+      "epoch": 0.022829665569225437,
+      "grad_norm": 1.9933409690856934,
+      "learning_rate": 4.999955597496219e-05,
+      "loss": 0.7874,
+      "step": 1581
+    },
+    {
+      "epoch": 0.023277306070582798,
+      "grad_norm": 2.314889907836914,
+      "learning_rate": 4.9998914381774255e-05,
+      "loss": 0.7757,
+      "step": 1612
+    },
+    {
+      "epoch": 0.023724946571940162,
+      "grad_norm": 2.2891199588775635,
+      "learning_rate": 4.999799067923527e-05,
+      "loss": 0.7713,
+      "step": 1643
+    },
+    {
+      "epoch": 0.024172587073297523,
+      "grad_norm": 2.4892444610595703,
+      "learning_rate": 4.999678487776908e-05,
+      "loss": 0.7687,
+      "step": 1674
+    },
+    {
+      "epoch": 0.024620227574654884,
+      "grad_norm": 2.3015685081481934,
+      "learning_rate": 4.9995296990983006e-05,
+      "loss": 0.7721,
+      "step": 1705
+    },
+    {
+      "epoch": 0.025067868076012245,
+      "grad_norm": 2.278954029083252,
+      "learning_rate": 4.999352703566763e-05,
+      "loss": 0.7741,
+      "step": 1736
+    },
+    {
+      "epoch": 0.025515508577369606,
+      "grad_norm": 1.7260370254516602,
+      "learning_rate": 4.999147503179668e-05,
+      "loss": 0.7681,
+      "step": 1767
+    },
+    {
+      "epoch": 0.025963149078726967,
+      "grad_norm": 2.0179309844970703,
+      "learning_rate": 4.998914100252672e-05,
+      "loss": 0.7604,
+      "step": 1798
+    },
+    {
+      "epoch": 0.02641078958008433,
+      "grad_norm": 2.53022837638855,
+      "learning_rate": 4.998652497419696e-05,
+      "loss": 0.7598,
+      "step": 1829
+    },
+    {
+      "epoch": 0.026858430081441693,
+      "grad_norm": 1.859253168106079,
+      "learning_rate": 4.9983626976328927e-05,
+      "loss": 0.7606,
+      "step": 1860
+    },
+    {
+      "epoch": 0.027306070582799054,
+      "grad_norm": 1.759303331375122,
+      "learning_rate": 4.998044704162613e-05,
+      "loss": 0.7532,
+      "step": 1891
+    },
+    {
+      "epoch": 0.027753711084156415,
+      "grad_norm": 2.4389419555664062,
+      "learning_rate": 4.9976985205973705e-05,
+      "loss": 0.7646,
+      "step": 1922
+    },
+    {
+      "epoch": 0.028201351585513776,
+      "grad_norm": 2.155348777770996,
+      "learning_rate": 4.997324150843799e-05,
+      "loss": 0.7569,
+      "step": 1953
+    },
+    {
+      "epoch": 0.028648992086871137,
+      "grad_norm": 2.0138537883758545,
+      "learning_rate": 4.99692159912661e-05,
+      "loss": 0.7677,
+      "step": 1984
+    },
+    {
+      "epoch": 0.029096632588228498,
+      "grad_norm": 2.5275282859802246,
+      "learning_rate": 4.996490869988546e-05,
+      "loss": 0.7519,
+      "step": 2015
+    },
+    {
+      "epoch": 0.02954427308958586,
+      "grad_norm": 1.8147333860397339,
+      "learning_rate": 4.996031968290326e-05,
+      "loss": 0.7509,
+      "step": 2046
+    },
+    {
+      "epoch": 0.029991913590943223,
+      "grad_norm": 2.1941769123077393,
+      "learning_rate": 4.995544899210594e-05,
+      "loss": 0.754,
+      "step": 2077
+    },
+    {
+      "epoch": 0.030439554092300584,
+      "grad_norm": 1.8953059911727905,
+      "learning_rate": 4.9950296682458583e-05,
+      "loss": 0.747,
+      "step": 2108
+    },
+    {
+      "epoch": 0.030887194593657945,
+      "grad_norm": 3.3973031044006348,
+      "learning_rate": 4.994486281210429e-05,
+      "loss": 0.7513,
+      "step": 2139
+    },
+    {
+      "epoch": 0.031334835095015307,
+      "grad_norm": 2.66795015335083,
+      "learning_rate": 4.9939147442363566e-05,
+      "loss": 0.7469,
+      "step": 2170
+    },
+    {
+      "epoch": 0.03178247559637267,
+      "grad_norm": 1.6254230737686157,
+      "learning_rate": 4.9933150637733574e-05,
+      "loss": 0.7297,
+      "step": 2201
+    },
+    {
+      "epoch": 0.03223011609773003,
+      "grad_norm": 1.822745680809021,
+      "learning_rate": 4.992687246588743e-05,
+      "loss": 0.754,
+      "step": 2232
+    },
+    {
+      "epoch": 0.03267775659908739,
+      "grad_norm": 1.6898781061172485,
+      "learning_rate": 4.992031299767347e-05,
+      "loss": 0.7478,
+      "step": 2263
+    },
+    {
+      "epoch": 0.03312539710044475,
+      "grad_norm": 1.799280047416687,
+      "learning_rate": 4.9913472307114386e-05,
+      "loss": 0.746,
+      "step": 2294
+    },
+    {
+      "epoch": 0.033573037601802115,
+      "grad_norm": 2.2501840591430664,
+      "learning_rate": 4.9906350471406446e-05,
+      "loss": 0.7408,
+      "step": 2325
+    },
+    {
+      "epoch": 0.03402067810315947,
+      "grad_norm": 2.3315324783325195,
+      "learning_rate": 4.989894757091861e-05,
+      "loss": 0.7301,
+      "step": 2356
+    },
+    {
+      "epoch": 0.03446831860451684,
+      "grad_norm": 1.5820438861846924,
+      "learning_rate": 4.989126368919158e-05,
+      "loss": 0.7305,
+      "step": 2387
+    },
+    {
+      "epoch": 0.0349159591058742,
+      "grad_norm": 2.5696022510528564,
+      "learning_rate": 4.988329891293693e-05,
+      "loss": 0.7337,
+      "step": 2418
+    },
+    {
+      "epoch": 0.03536359960723156,
+      "grad_norm": 1.8880938291549683,
+      "learning_rate": 4.987505333203608e-05,
+      "loss": 0.7385,
+      "step": 2449
+    },
+    {
+      "epoch": 0.035811240108588924,
+      "grad_norm": 2.6148738861083984,
+      "learning_rate": 4.9866527039539276e-05,
+      "loss": 0.7292,
+      "step": 2480
+    },
+    {
+      "epoch": 0.03625888060994628,
+      "grad_norm": 1.6925290822982788,
+      "learning_rate": 4.9857720131664594e-05,
+      "loss": 0.7344,
+      "step": 2511
+    },
+    {
+      "epoch": 0.036706521111303646,
+      "grad_norm": 1.7675210237503052,
+      "learning_rate": 4.9848632707796773e-05,
+      "loss": 0.7354,
+      "step": 2542
+    },
+    {
+      "epoch": 0.037154161612661,
+      "grad_norm": 2.1053173542022705,
+      "learning_rate": 4.9839264870486155e-05,
+      "loss": 0.7272,
+      "step": 2573
+    },
+    {
+      "epoch": 0.03760180211401837,
+      "grad_norm": 1.9718347787857056,
+      "learning_rate": 4.9829616725447526e-05,
+      "loss": 0.7336,
+      "step": 2604
+    },
+    {
+      "epoch": 0.03804944261537573,
+      "grad_norm": 1.5777671337127686,
+      "learning_rate": 4.981968838155888e-05,
+      "loss": 0.7182,
+      "step": 2635
+    },
+    {
+      "epoch": 0.03849708311673309,
+      "grad_norm": 1.905127763748169,
+      "learning_rate": 4.980947995086024e-05,
+      "loss": 0.7296,
+      "step": 2666
+    },
+    {
+      "epoch": 0.038944723618090454,
+      "grad_norm": 1.63962721824646,
+      "learning_rate": 4.979899154855234e-05,
+      "loss": 0.7249,
+      "step": 2697
+    },
+    {
+      "epoch": 0.03939236411944781,
+      "grad_norm": 1.584331750869751,
+      "learning_rate": 4.9788223292995386e-05,
+      "loss": 0.7345,
+      "step": 2728
+    },
+    {
+      "epoch": 0.039840004620805176,
+      "grad_norm": 1.9111014604568481,
+      "learning_rate": 4.977717530570768e-05,
+      "loss": 0.7225,
+      "step": 2759
+    },
+    {
+      "epoch": 0.040287645122162534,
+      "grad_norm": 2.3216073513031006,
+      "learning_rate": 4.976584771136425e-05,
+      "loss": 0.7207,
+      "step": 2790
+    },
+    {
+      "epoch": 0.0407352856235199,
+      "grad_norm": 1.6002410650253296,
+      "learning_rate": 4.975424063779547e-05,
+      "loss": 0.7228,
+      "step": 2821
+    },
+    {
+      "epoch": 0.04118292612487726,
+      "grad_norm": 2.104731798171997,
+      "learning_rate": 4.974235421598557e-05,
+      "loss": 0.7127,
+      "step": 2852
+    },
+    {
+      "epoch": 0.04163056662623462,
+      "grad_norm": 1.7114660739898682,
+      "learning_rate": 4.973018858007122e-05,
+      "loss": 0.7283,
+      "step": 2883
+    },
+    {
+      "epoch": 0.042078207127591985,
+      "grad_norm": 1.948133945465088,
+      "learning_rate": 4.9717743867339963e-05,
+      "loss": 0.7209,
+      "step": 2914
+    },
+    {
+      "epoch": 0.04252584762894934,
+      "grad_norm": 1.621764898300171,
+      "learning_rate": 4.9705020218228695e-05,
+      "loss": 0.7272,
+      "step": 2945
+    },
+    {
+      "epoch": 0.04297348813030671,
+      "grad_norm": 1.6967558860778809,
+      "learning_rate": 4.969201777632205e-05,
+      "loss": 0.7191,
+      "step": 2976
+    },
+    {
+      "epoch": 0.043421128631664065,
+      "grad_norm": 1.6656996011734009,
+      "learning_rate": 4.9678736688350846e-05,
+      "loss": 0.7205,
+      "step": 3007
+    },
+    {
+      "epoch": 0.04386876913302143,
+      "grad_norm": 2.151475191116333,
+      "learning_rate": 4.966517710419033e-05,
+      "loss": 0.7168,
+      "step": 3038
+    },
+    {
+      "epoch": 0.044316409634378794,
+      "grad_norm": 2.213109016418457,
+      "learning_rate": 4.965133917685858e-05,
+      "loss": 0.7139,
+      "step": 3069
+    },
+    {
+      "epoch": 0.04476405013573615,
+      "grad_norm": 1.5380377769470215,
+      "learning_rate": 4.9637223062514714e-05,
+      "loss": 0.7237,
+      "step": 3100
+    },
+    {
+      "epoch": 0.045211690637093516,
+      "grad_norm": 2.312377452850342,
+      "learning_rate": 4.962282892045718e-05,
+      "loss": 0.7156,
+      "step": 3131
+    },
+    {
+      "epoch": 0.04565933113845087,
+      "grad_norm": 1.7220717668533325,
+      "learning_rate": 4.9608156913121904e-05,
+      "loss": 0.7122,
+      "step": 3162
+    },
+    {
+      "epoch": 0.04610697163980824,
+      "grad_norm": 1.802856206893921,
+      "learning_rate": 4.959320720608049e-05,
+      "loss": 0.7128,
+      "step": 3193
+    },
+    {
+      "epoch": 0.046554612141165595,
+      "grad_norm": 1.6629964113235474,
+      "learning_rate": 4.9577979968038354e-05,
+      "loss": 0.7172,
+      "step": 3224
+    },
+    {
+      "epoch": 0.04700225264252296,
+      "grad_norm": 3.440115213394165,
+      "learning_rate": 4.956247537083282e-05,
+      "loss": 0.7213,
+      "step": 3255
+    },
+    {
+      "epoch": 0.047449893143880324,
+      "grad_norm": 1.5721139907836914,
+      "learning_rate": 4.9546693589431145e-05,
+      "loss": 0.7148,
+      "step": 3286
+    },
+    {
+      "epoch": 0.04789753364523768,
+      "grad_norm": 2.0920398235321045,
+      "learning_rate": 4.9530634801928595e-05,
+      "loss": 0.7145,
+      "step": 3317
+    },
+    {
+      "epoch": 0.048345174146595046,
+      "grad_norm": 1.666566014289856,
+      "learning_rate": 4.9514299189546395e-05,
+      "loss": 0.7095,
+      "step": 3348
+    },
+    {
+      "epoch": 0.048792814647952404,
+      "grad_norm": 1.8222129344940186,
+      "learning_rate": 4.949768693662973e-05,
+      "loss": 0.7138,
+      "step": 3379
+    },
+    {
+      "epoch": 0.04924045514930977,
+      "grad_norm": 1.7302964925765991,
+      "learning_rate": 4.948079823064559e-05,
+      "loss": 0.7017,
+      "step": 3410
+    },
+    {
+      "epoch": 0.049688095650667126,
+      "grad_norm": 1.7338463068008423,
+      "learning_rate": 4.946363326218074e-05,
+      "loss": 0.6979,
+      "step": 3441
+    },
+    {
+      "epoch": 0.05013573615202449,
+      "grad_norm": 1.5637450218200684,
+      "learning_rate": 4.9446192224939525e-05,
+      "loss": 0.7011,
+      "step": 3472
+    },
+    {
+      "epoch": 0.050583376653381855,
+      "grad_norm": 1.5632222890853882,
+      "learning_rate": 4.942847531574167e-05,
+      "loss": 0.704,
+      "step": 3503
+    },
+    {
+      "epoch": 0.05103101715473921,
+      "grad_norm": 1.588402509689331,
+      "learning_rate": 4.941048273452008e-05,
+      "loss": 0.7011,
+      "step": 3534
+    },
+    {
+      "epoch": 0.05147865765609658,
+      "grad_norm": 1.8840582370758057,
+      "learning_rate": 4.9392214684318605e-05,
+      "loss": 0.7016,
+      "step": 3565
+    },
+    {
+      "epoch": 0.051926298157453935,
+      "grad_norm": 1.2702268362045288,
+      "learning_rate": 4.93736713712897e-05,
+      "loss": 0.7004,
+      "step": 3596
+    },
+    {
+      "epoch": 0.0523739386588113,
+      "grad_norm": 1.3812692165374756,
+      "learning_rate": 4.9354853004692124e-05,
+      "loss": 0.7046,
+      "step": 3627
+    },
+    {
+      "epoch": 0.05282157916016866,
+      "grad_norm": 1.7257345914840698,
+      "learning_rate": 4.93357597968886e-05,
+      "loss": 0.6976,
+      "step": 3658
+    },
+    {
+      "epoch": 0.05326921966152602,
+      "grad_norm": 1.7458925247192383,
+      "learning_rate": 4.931639196334338e-05,
+      "loss": 0.6997,
+      "step": 3689
+    },
+    {
+      "epoch": 0.053716860162883386,
+      "grad_norm": 2.1996099948883057,
+      "learning_rate": 4.9296749722619826e-05,
+      "loss": 0.6991,
+      "step": 3720
+    },
+    {
+      "epoch": 0.05416450066424074,
+      "grad_norm": 1.6615021228790283,
+      "learning_rate": 4.9276833296377966e-05,
+      "loss": 0.7005,
+      "step": 3751
+    },
+    {
+      "epoch": 0.05461214116559811,
+      "grad_norm": 1.6276952028274536,
+      "learning_rate": 4.925664290937196e-05,
+      "loss": 0.7097,
+      "step": 3782
+    },
+    {
+      "epoch": 0.055059781666955465,
+      "grad_norm": 1.758227825164795,
+      "learning_rate": 4.9236178789447576e-05,
+      "loss": 0.6955,
+      "step": 3813
+    },
+    {
+      "epoch": 0.05550742216831283,
+      "grad_norm": 1.195280909538269,
+      "learning_rate": 4.921544116753962e-05,
+      "loss": 0.7073,
+      "step": 3844
+    },
+    {
+      "epoch": 0.05595506266967019,
+      "grad_norm": 1.6281015872955322,
+      "learning_rate": 4.919443027766935e-05,
+      "loss": 0.7022,
+      "step": 3875
+    },
+    {
+      "epoch": 0.05640270317102755,
+      "grad_norm": 1.3543150424957275,
+      "learning_rate": 4.91731463569418e-05,
+      "loss": 0.7036,
+      "step": 3906
+    },
+    {
+      "epoch": 0.056850343672384916,
+      "grad_norm": 2.16947078704834,
+      "learning_rate": 4.915158964554312e-05,
+      "loss": 0.7007,
+      "step": 3937
+    },
+    {
+      "epoch": 0.057297984173742274,
+      "grad_norm": 1.324578881263733,
+      "learning_rate": 4.912976038673786e-05,
+      "loss": 0.6941,
+      "step": 3968
+    },
+    {
+      "epoch": 0.05774562467509964,
+      "grad_norm": 1.9811108112335205,
+      "learning_rate": 4.9107658826866254e-05,
+      "loss": 0.6908,
+      "step": 3999
+    },
+    {
+      "epoch": 0.058193265176456996,
+      "grad_norm": 1.2975554466247559,
+      "learning_rate": 4.908528521534139e-05,
+      "loss": 0.6936,
+      "step": 4030
+    },
+    {
+      "epoch": 0.05864090567781436,
+      "grad_norm": 1.583282232284546,
+      "learning_rate": 4.906263980464644e-05,
+      "loss": 0.698,
+      "step": 4061
+    },
+    {
+      "epoch": 0.05908854617917172,
+      "grad_norm": 1.3532944917678833,
+      "learning_rate": 4.903972285033178e-05,
+      "loss": 0.7049,
+      "step": 4092
+    },
+    {
+      "epoch": 0.05953618668052908,
+      "grad_norm": 2.1245481967926025,
+      "learning_rate": 4.901653461101213e-05,
+      "loss": 0.7016,
+      "step": 4123
+    },
+    {
+      "epoch": 0.05998382718188645,
+      "grad_norm": 1.6913797855377197,
+      "learning_rate": 4.8993075348363626e-05,
+      "loss": 0.6981,
+      "step": 4154
+    },
+    {
+      "epoch": 0.060431467683243804,
+      "grad_norm": 1.51249098777771,
+      "learning_rate": 4.896934532712084e-05,
+      "loss": 0.6955,
+      "step": 4185
+    },
+    {
+      "epoch": 0.06087910818460117,
+      "grad_norm": 1.3880395889282227,
+      "learning_rate": 4.8945344815073846e-05,
+      "loss": 0.6934,
+      "step": 4216
+    },
+    {
+      "epoch": 0.061326748685958526,
+      "grad_norm": 1.6354159116744995,
+      "learning_rate": 4.892107408306516e-05,
+      "loss": 0.6938,
+      "step": 4247
+    },
+    {
+      "epoch": 0.06177438918731589,
+      "grad_norm": 2.126742362976074,
+      "learning_rate": 4.889653340498669e-05,
+      "loss": 0.7003,
+      "step": 4278
+    },
+    {
+      "epoch": 0.06222202968867325,
+      "grad_norm": 1.7903707027435303,
+      "learning_rate": 4.8871723057776664e-05,
+      "loss": 0.6885,
+      "step": 4309
+    },
+    {
+      "epoch": 0.06266967019003061,
+      "grad_norm": 1.537806510925293,
+      "learning_rate": 4.8846643321416476e-05,
+      "loss": 0.6892,
+      "step": 4340
+    },
+    {
+      "epoch": 0.06311731069138797,
+      "grad_norm": 1.6445434093475342,
+      "learning_rate": 4.882129447892753e-05,
+      "loss": 0.6843,
+      "step": 4371
+    },
+    {
+      "epoch": 0.06356495119274534,
+      "grad_norm": 1.555373191833496,
+      "learning_rate": 4.8795676816368076e-05,
+      "loss": 0.6899,
+      "step": 4402
+    },
+    {
+      "epoch": 0.0640125916941027,
+      "grad_norm": 1.8370277881622314,
+      "learning_rate": 4.876979062282995e-05,
+      "loss": 0.6813,
+      "step": 4433
+    },
+    {
+      "epoch": 0.06446023219546006,
+      "grad_norm": 1.3132514953613281,
+      "learning_rate": 4.8743636190435325e-05,
+      "loss": 0.6832,
+      "step": 4464
+    },
+    {
+      "epoch": 0.06490787269681741,
+      "grad_norm": 1.3186298608779907,
+      "learning_rate": 4.871721381433344e-05,
+      "loss": 0.6879,
+      "step": 4495
+    },
+    {
+      "epoch": 0.06535551319817479,
+      "grad_norm": 1.4360268115997314,
+      "learning_rate": 4.869052379269719e-05,
+      "loss": 0.69,
+      "step": 4526
+    },
+    {
+      "epoch": 0.06580315369953214,
+      "grad_norm": 1.670765995979309,
+      "learning_rate": 4.866356642671985e-05,
+      "loss": 0.6865,
+      "step": 4557
+    },
+    {
+      "epoch": 0.0662507942008895,
+      "grad_norm": 1.7548723220825195,
+      "learning_rate": 4.8636342020611634e-05,
+      "loss": 0.6852,
+      "step": 4588
+    },
+    {
+      "epoch": 0.06669843470224687,
+      "grad_norm": 1.5086426734924316,
+      "learning_rate": 4.860885088159626e-05,
+      "loss": 0.6894,
+      "step": 4619
+    },
+    {
+      "epoch": 0.06714607520360423,
+      "grad_norm": 1.3140665292739868,
+      "learning_rate": 4.858109331990751e-05,
+      "loss": 0.6812,
+      "step": 4650
+    },
+    {
+      "epoch": 0.06759371570496159,
+      "grad_norm": 1.4212454557418823,
+      "learning_rate": 4.855306964878567e-05,
+      "loss": 0.6872,
+      "step": 4681
+    },
+    {
+      "epoch": 0.06804135620631895,
+      "grad_norm": 1.3034414052963257,
+      "learning_rate": 4.8524780184474084e-05,
+      "loss": 0.6901,
+      "step": 4712
+    },
+    {
+      "epoch": 0.06848899670767632,
+      "grad_norm": 1.3741438388824463,
+      "learning_rate": 4.8496225246215496e-05,
+      "loss": 0.6875,
+      "step": 4743
+    },
+    {
+      "epoch": 0.06893663720903367,
+      "grad_norm": 1.7262542247772217,
+      "learning_rate": 4.8467405156248505e-05,
+      "loss": 0.6868,
+      "step": 4774
+    },
+    {
+      "epoch": 0.06938427771039103,
+      "grad_norm": 1.3293650150299072,
+      "learning_rate": 4.843832023980392e-05,
+      "loss": 0.6891,
+      "step": 4805
+    },
+    {
+      "epoch": 0.0698319182117484,
+      "grad_norm": 1.3448151350021362,
+      "learning_rate": 4.840897082510106e-05,
+      "loss": 0.6765,
+      "step": 4836
+    },
+    {
+      "epoch": 0.07027955871310576,
+      "grad_norm": 2.961280584335327,
+      "learning_rate": 4.8379357243344084e-05,
+      "loss": 0.6939,
+      "step": 4867
+    },
+    {
+      "epoch": 0.07072719921446312,
+      "grad_norm": 1.8265361785888672,
+      "learning_rate": 4.8349479828718236e-05,
+      "loss": 0.677,
+      "step": 4898
+    },
+    {
+      "epoch": 0.07117483971582048,
+      "grad_norm": 1.490349531173706,
+      "learning_rate": 4.8319338918386075e-05,
+      "loss": 0.6778,
+      "step": 4929
+    },
+    {
+      "epoch": 0.07162248021717785,
+      "grad_norm": 1.3669307231903076,
+      "learning_rate": 4.828893485248369e-05,
+      "loss": 0.6746,
+      "step": 4960
+    },
+    {
+      "epoch": 0.0720701207185352,
+      "grad_norm": 1.3995884656906128,
+      "learning_rate": 4.825826797411682e-05,
+      "loss": 0.6757,
+      "step": 4991
+    },
+    {
+      "epoch": 0.07251776121989256,
+      "grad_norm": 1.1217372417449951,
+      "learning_rate": 4.822733862935702e-05,
+      "loss": 0.6832,
+      "step": 5022
+    },
+    {
+      "epoch": 0.07296540172124993,
+      "grad_norm": 1.2192097902297974,
+      "learning_rate": 4.819614716723775e-05,
+      "loss": 0.6868,
+      "step": 5053
+    },
+    {
+      "epoch": 0.07341304222260729,
+      "grad_norm": 1.5045067071914673,
+      "learning_rate": 4.8164693939750425e-05,
+      "loss": 0.6793,
+      "step": 5084
+    },
+    {
+      "epoch": 0.07386068272396465,
+      "grad_norm": 1.7127234935760498,
+      "learning_rate": 4.813297930184042e-05,
+      "loss": 0.6797,
+      "step": 5115
+    },
+    {
+      "epoch": 0.074308323225322,
+      "grad_norm": 1.846561312675476,
+      "learning_rate": 4.810100361140314e-05,
+      "loss": 0.6767,
+      "step": 5146
+    },
+    {
+      "epoch": 0.07475596372667938,
+      "grad_norm": 1.3076797723770142,
+      "learning_rate": 4.8068767229279885e-05,
+      "loss": 0.6855,
+      "step": 5177
+    },
+    {
+      "epoch": 0.07520360422803674,
+      "grad_norm": 1.4170383214950562,
+      "learning_rate": 4.8036270519253854e-05,
+      "loss": 0.681,
+      "step": 5208
+    },
+    {
+      "epoch": 0.0756512447293941,
+      "grad_norm": 1.2504942417144775,
+      "learning_rate": 4.8003513848046e-05,
+      "loss": 0.6778,
+      "step": 5239
+    },
+    {
+      "epoch": 0.07609888523075146,
+      "grad_norm": 1.1522283554077148,
+      "learning_rate": 4.79704975853109e-05,
+      "loss": 0.6749,
+      "step": 5270
+    },
+    {
+      "epoch": 0.07654652573210882,
+      "grad_norm": 1.6351525783538818,
+      "learning_rate": 4.793722210363262e-05,
+      "loss": 0.6745,
+      "step": 5301
+    },
+    {
+      "epoch": 0.07699416623346618,
+      "grad_norm": 1.5093014240264893,
+      "learning_rate": 4.7903687778520414e-05,
+      "loss": 0.6747,
+      "step": 5332
+    },
+    {
+      "epoch": 0.07744180673482354,
+      "grad_norm": 1.362160563468933,
+      "learning_rate": 4.7869894988404593e-05,
+      "loss": 0.673,
+      "step": 5363
+    },
+    {
+      "epoch": 0.07788944723618091,
+      "grad_norm": 1.2021727561950684,
+      "learning_rate": 4.783584411463221e-05,
+      "loss": 0.6768,
+      "step": 5394
+    },
+    {
+      "epoch": 0.07833708773753827,
+      "grad_norm": 2.1543540954589844,
+      "learning_rate": 4.780153554146274e-05,
+      "loss": 0.672,
+      "step": 5425
+    },
+    {
+      "epoch": 0.07878472823889562,
+      "grad_norm": 1.882712721824646,
+      "learning_rate": 4.7766969656063766e-05,
+      "loss": 0.6926,
+      "step": 5456
+    },
+    {
+      "epoch": 0.079232368740253,
+      "grad_norm": 1.3975650072097778,
+      "learning_rate": 4.773214684850662e-05,
+      "loss": 0.6747,
+      "step": 5487
+    },
+    {
+      "epoch": 0.07968000924161035,
+      "grad_norm": 1.3912913799285889,
+      "learning_rate": 4.769706751176193e-05,
+      "loss": 0.6756,
+      "step": 5518
+    },
+    {
+      "epoch": 0.08012764974296771,
+      "grad_norm": 1.7227635383605957,
+      "learning_rate": 4.7661732041695264e-05,
+      "loss": 0.6694,
+      "step": 5549
+    },
+    {
+      "epoch": 0.08057529024432507,
+      "grad_norm": 1.3151129484176636,
+      "learning_rate": 4.762614083706258e-05,
+      "loss": 0.6715,
+      "step": 5580
+    },
+    {
+      "epoch": 0.08102293074568244,
+      "grad_norm": 1.0972425937652588,
+      "learning_rate": 4.759029429950581e-05,
+      "loss": 0.6661,
+      "step": 5611
+    },
+    {
+      "epoch": 0.0814705712470398,
+      "grad_norm": 1.2346575260162354,
+      "learning_rate": 4.7554192833548235e-05,
+      "loss": 0.66,
+      "step": 5642
+    },
+    {
+      "epoch": 0.08191821174839715,
+      "grad_norm": 1.4536516666412354,
+      "learning_rate": 4.751783684659e-05,
+      "loss": 0.6743,
+      "step": 5673
+    },
+    {
+      "epoch": 0.08236585224975453,
+      "grad_norm": 1.1361631155014038,
+      "learning_rate": 4.748122674890348e-05,
+      "loss": 0.6791,
+      "step": 5704
+    },
+    {
+      "epoch": 0.08281349275111188,
+      "grad_norm": 1.2605111598968506,
+      "learning_rate": 4.7444362953628654e-05,
+      "loss": 0.6797,
+      "step": 5735
+    },
+    {
+      "epoch": 0.08326113325246924,
+      "grad_norm": 1.2355903387069702,
+      "learning_rate": 4.7407245876768424e-05,
+      "loss": 0.6642,
+      "step": 5766
+    },
+    {
+      "epoch": 0.0837087737538266,
+      "grad_norm": 1.6677048206329346,
+      "learning_rate": 4.736987593718397e-05,
+      "loss": 0.6759,
+      "step": 5797
+    },
+    {
+      "epoch": 0.08415641425518397,
+      "grad_norm": 1.4781981706619263,
+      "learning_rate": 4.733225355658999e-05,
+      "loss": 0.6707,
+      "step": 5828
+    },
+    {
+      "epoch": 0.08460405475654133,
+      "grad_norm": 1.138583779335022,
+      "learning_rate": 4.7294379159549926e-05,
+      "loss": 0.6636,
+      "step": 5859
+    },
+    {
+      "epoch": 0.08505169525789869,
+      "grad_norm": 1.529036283493042,
+      "learning_rate": 4.725625317347119e-05,
+      "loss": 0.6705,
+      "step": 5890
+    },
+    {
+      "epoch": 0.08549933575925606,
+      "grad_norm": 1.3216760158538818,
+      "learning_rate": 4.7217876028600374e-05,
+      "loss": 0.6714,
+      "step": 5921
+    },
+    {
+      "epoch": 0.08594697626061341,
+      "grad_norm": 1.1820168495178223,
+      "learning_rate": 4.717924815801832e-05,
+      "loss": 0.6757,
+      "step": 5952
+    },
+    {
+      "epoch": 0.08639461676197077,
+      "grad_norm": 1.393571138381958,
+      "learning_rate": 4.714036999763532e-05,
+      "loss": 0.6672,
+      "step": 5983
+    },
+    {
+      "epoch": 0.08684225726332813,
+      "grad_norm": 1.4574682712554932,
+      "learning_rate": 4.7101241986186116e-05,
+      "loss": 0.6655,
+      "step": 6014
+    },
+    {
+      "epoch": 0.0872898977646855,
+      "grad_norm": 1.138645887374878,
+      "learning_rate": 4.7061864565225e-05,
+      "loss": 0.6663,
+      "step": 6045
+    },
+    {
+      "epoch": 0.08773753826604286,
+      "grad_norm": 1.7602777481079102,
+      "learning_rate": 4.702223817912081e-05,
+      "loss": 0.6695,
+      "step": 6076
+    },
+    {
+      "epoch": 0.08818517876740022,
+      "grad_norm": 1.2323459386825562,
+      "learning_rate": 4.698236327505195e-05,
+      "loss": 0.6636,
+      "step": 6107
+    },
+    {
+      "epoch": 0.08863281926875759,
+      "grad_norm": 1.6881431341171265,
+      "learning_rate": 4.694224030300127e-05,
+      "loss": 0.6653,
+      "step": 6138
+    },
+    {
+      "epoch": 0.08908045977011494,
+      "grad_norm": 1.391417384147644,
+      "learning_rate": 4.690186971575107e-05,
+      "loss": 0.6636,
+      "step": 6169
+    },
+    {
+      "epoch": 0.0895281002714723,
+      "grad_norm": 1.3066257238388062,
+      "learning_rate": 4.6861251968877916e-05,
+      "loss": 0.6777,
+      "step": 6200
+    },
+    {
+      "epoch": 0.08997574077282966,
+      "grad_norm": 1.2001326084136963,
+      "learning_rate": 4.68203875207476e-05,
+      "loss": 0.6683,
+      "step": 6231
+    },
+    {
+      "epoch": 0.09042338127418703,
+      "grad_norm": 1.4361172914505005,
+      "learning_rate": 4.677927683250983e-05,
+      "loss": 0.6634,
+      "step": 6262
+    },
+    {
+      "epoch": 0.09087102177554439,
+      "grad_norm": 8.04520320892334,
+      "learning_rate": 4.6737920368093156e-05,
+      "loss": 0.6752,
+      "step": 6293
+    },
+    {
+      "epoch": 0.09131866227690175,
+      "grad_norm": 1.4874210357666016,
+      "learning_rate": 4.669631859419965e-05,
+      "loss": 0.6733,
+      "step": 6324
+    },
+    {
+      "epoch": 0.09176630277825912,
+      "grad_norm": 1.234491229057312,
+      "learning_rate": 4.6654471980299676e-05,
+      "loss": 0.668,
+      "step": 6355
+    },
+    {
+      "epoch": 0.09221394327961648,
+      "grad_norm": 1.2088687419891357,
+      "learning_rate": 4.661238099862658e-05,
+      "loss": 0.6705,
+      "step": 6386
+    },
+    {
+      "epoch": 0.09266158378097383,
+      "grad_norm": 1.1937814950942993,
+      "learning_rate": 4.657004612417138e-05,
+      "loss": 0.6853,
+      "step": 6417
+    },
+    {
+      "epoch": 0.09310922428233119,
+      "grad_norm": 1.5205374956130981,
+      "learning_rate": 4.6527467834677374e-05,
+      "loss": 0.685,
+      "step": 6448
+    },
+    {
+      "epoch": 0.09355686478368856,
+      "grad_norm": 1.2221660614013672,
+      "learning_rate": 4.648464661063478e-05,
+      "loss": 0.6622,
+      "step": 6479
+    },
+    {
+      "epoch": 0.09400450528504592,
+      "grad_norm": 1.0762608051300049,
+      "learning_rate": 4.6441582935275264e-05,
+      "loss": 0.669,
+      "step": 6510
+    },
+    {
+      "epoch": 0.09445214578640328,
+      "grad_norm": 1.4416946172714233,
+      "learning_rate": 4.6398277294566586e-05,
+      "loss": 0.6674,
+      "step": 6541
+    },
+    {
+      "epoch": 0.09489978628776065,
+      "grad_norm": 1.559158205986023,
+      "learning_rate": 4.6354730177207e-05,
+      "loss": 0.6681,
+      "step": 6572
+    },
+    {
+      "epoch": 0.095347426789118,
+      "grad_norm": 1.3833891153335571,
+      "learning_rate": 4.6310942074619787e-05,
+      "loss": 0.6681,
+      "step": 6603
+    },
+    {
+      "epoch": 0.09579506729047536,
+      "grad_norm": 1.6753300428390503,
+      "learning_rate": 4.626691348094777e-05,
+      "loss": 0.6658,
+      "step": 6634
+    },
+    {
+      "epoch": 0.09624270779183272,
+      "grad_norm": 1.951198697090149,
+      "learning_rate": 4.622264489304762e-05,
+      "loss": 0.6654,
+      "step": 6665
+    },
+    {
+      "epoch": 0.09669034829319009,
+      "grad_norm": 1.2356919050216675,
+      "learning_rate": 4.617813681048434e-05,
+      "loss": 0.6651,
+      "step": 6696
+    },
+    {
+      "epoch": 0.09713798879454745,
+      "grad_norm": 1.2712593078613281,
+      "learning_rate": 4.61333897355256e-05,
+      "loss": 0.6646,
+      "step": 6727
+    },
+    {
+      "epoch": 0.09758562929590481,
+      "grad_norm": 1.1935900449752808,
+      "learning_rate": 4.608840417313604e-05,
+      "loss": 0.674,
+      "step": 6758
+    },
+    {
+      "epoch": 0.09803326979726218,
+      "grad_norm": 1.1649430990219116,
+      "learning_rate": 4.6043180630971646e-05,
+      "loss": 0.6644,
+      "step": 6789
+    },
+    {
+      "epoch": 0.09848091029861954,
+      "grad_norm": 1.4281456470489502,
+      "learning_rate": 4.599771961937391e-05,
+      "loss": 0.6673,
+      "step": 6820
+    },
+    {
+      "epoch": 0.0989285507999769,
+      "grad_norm": 1.3064521551132202,
+      "learning_rate": 4.5952021651364204e-05,
+      "loss": 0.6584,
+      "step": 6851
+    },
+    {
+      "epoch": 0.09937619130133425,
+      "grad_norm": 1.2546554803848267,
+      "learning_rate": 4.590608724263786e-05,
+      "loss": 0.6612,
+      "step": 6882
+    },
+    {
+      "epoch": 0.09982383180269162,
+      "grad_norm": 1.1866974830627441,
+      "learning_rate": 4.585991691155845e-05,
+      "loss": 0.6612,
+      "step": 6913
+    },
+    {
+      "epoch": 0.10027147230404898,
+      "grad_norm": 1.6166640520095825,
+      "learning_rate": 4.581351117915188e-05,
+      "loss": 0.6551,
+      "step": 6944
+    },
+    {
+      "epoch": 0.10071911280540634,
+      "grad_norm": 1.5471700429916382,
+      "learning_rate": 4.5766870569100534e-05,
+      "loss": 0.6607,
+      "step": 6975
+    },
+    {
+      "epoch": 0.10116675330676371,
+      "grad_norm": 1.3361026048660278,
+      "learning_rate": 4.571999560773736e-05,
+      "loss": 0.666,
+      "step": 7006
+    },
+    {
+      "epoch": 0.10161439380812107,
+      "grad_norm": 1.2938140630722046,
+      "learning_rate": 4.5672886824039915e-05,
+      "loss": 0.6547,
+      "step": 7037
+    },
+    {
+      "epoch": 0.10206203430947842,
+      "grad_norm": 1.2688400745391846,
+      "learning_rate": 4.5625544749624435e-05,
+      "loss": 0.6624,
+      "step": 7068
+    },
+    {
+      "epoch": 0.10250967481083578,
+      "grad_norm": 1.6306285858154297,
+      "learning_rate": 4.5577969918739794e-05,
+      "loss": 0.6627,
+      "step": 7099
+    },
+    {
+      "epoch": 0.10295731531219315,
+      "grad_norm": 1.3346176147460938,
+      "learning_rate": 4.5530162868261486e-05,
+      "loss": 0.6577,
+      "step": 7130
+    },
+    {
+      "epoch": 0.10340495581355051,
+      "grad_norm": 1.0933984518051147,
+      "learning_rate": 4.548212413768558e-05,
+      "loss": 0.6602,
+      "step": 7161
+    },
+    {
+      "epoch": 0.10385259631490787,
+      "grad_norm": 1.575859785079956,
+      "learning_rate": 4.543385426912261e-05,
+      "loss": 0.6593,
+      "step": 7192
+    },
+    {
+      "epoch": 0.10430023681626524,
+      "grad_norm": 1.4265861511230469,
+      "learning_rate": 4.53853538072915e-05,
+      "loss": 0.6564,
+      "step": 7223
+    },
+    {
+      "epoch": 0.1047478773176226,
+      "grad_norm": 1.737012267112732,
+      "learning_rate": 4.533662329951336e-05,
+      "loss": 0.6593,
+      "step": 7254
+    },
+    {
+      "epoch": 0.10519551781897996,
+      "grad_norm": 1.0257115364074707,
+      "learning_rate": 4.528766329570536e-05,
+      "loss": 0.6514,
+      "step": 7285
+    },
+    {
+      "epoch": 0.10564315832033731,
+      "grad_norm": 1.5043773651123047,
+      "learning_rate": 4.523847434837447e-05,
+      "loss": 0.6635,
+      "step": 7316
+    },
+    {
+      "epoch": 0.10609079882169468,
+      "grad_norm": 1.5642234086990356,
+      "learning_rate": 4.518905701261128e-05,
+      "loss": 0.6558,
+      "step": 7347
+    },
+    {
+      "epoch": 0.10653843932305204,
+      "grad_norm": 1.1821067333221436,
+      "learning_rate": 4.5139411846083715e-05,
+      "loss": 0.6686,
+      "step": 7378
+    },
+    {
+      "epoch": 0.1069860798244094,
+      "grad_norm": 1.5492759943008423,
+      "learning_rate": 4.508953940903073e-05,
+      "loss": 0.6543,
+      "step": 7409
+    },
+    {
+      "epoch": 0.10743372032576677,
+      "grad_norm": 1.281914234161377,
+      "learning_rate": 4.5039440264255994e-05,
+      "loss": 0.6516,
+      "step": 7440
+    },
+    {
+      "epoch": 0.10788136082712413,
+      "grad_norm": 1.3318305015563965,
+      "learning_rate": 4.498911497712155e-05,
+      "loss": 0.656,
+      "step": 7471
+    },
+    {
+      "epoch": 0.10832900132848149,
+      "grad_norm": 1.3832449913024902,
+      "learning_rate": 4.493856411554142e-05,
+      "loss": 0.6475,
+      "step": 7502
+    },
+    {
+      "epoch": 0.10877664182983884,
+      "grad_norm": 1.3547158241271973,
+      "learning_rate": 4.4887788249975206e-05,
+      "loss": 0.6594,
+      "step": 7533
+    },
+    {
+      "epoch": 0.10922428233119622,
+      "grad_norm": 1.4633681774139404,
+      "learning_rate": 4.4836787953421656e-05,
+      "loss": 0.6707,
+      "step": 7564
+    },
+    {
+      "epoch": 0.10967192283255357,
+      "grad_norm": 1.1781059503555298,
+      "learning_rate": 4.478556380141218e-05,
+      "loss": 0.6626,
+      "step": 7595
+    },
+    {
+      "epoch": 0.11011956333391093,
+      "grad_norm": 1.4727883338928223,
+      "learning_rate": 4.4734116372004375e-05,
+      "loss": 0.6535,
+      "step": 7626
+    },
+    {
+      "epoch": 0.1105672038352683,
+      "grad_norm": 1.3888640403747559,
+      "learning_rate": 4.4682446245775477e-05,
+      "loss": 0.6606,
+      "step": 7657
+    },
+    {
+      "epoch": 0.11101484433662566,
+      "grad_norm": 1.308769941329956,
+      "learning_rate": 4.463055400581586e-05,
+      "loss": 0.6667,
+      "step": 7688
+    },
+    {
+      "epoch": 0.11146248483798302,
+      "grad_norm": 1.3579630851745605,
+      "learning_rate": 4.4578440237722374e-05,
+      "loss": 0.6621,
+      "step": 7719
+    },
+    {
+      "epoch": 0.11191012533934037,
+      "grad_norm": 1.1285645961761475,
+      "learning_rate": 4.452610552959183e-05,
+      "loss": 0.6597,
+      "step": 7750
+    },
+    {
+      "epoch": 0.11235776584069775,
+      "grad_norm": 1.1144675016403198,
+      "learning_rate": 4.447355047201428e-05,
+      "loss": 0.6638,
+      "step": 7781
+    },
+    {
+      "epoch": 0.1128054063420551,
+      "grad_norm": 1.1993658542633057,
+      "learning_rate": 4.4420775658066414e-05,
+      "loss": 0.6704,
+      "step": 7812
+    },
+    {
+      "epoch": 0.11325304684341246,
+      "grad_norm": 1.0608967542648315,
+      "learning_rate": 4.436778168330484e-05,
+      "loss": 0.6573,
+      "step": 7843
+    },
+    {
+      "epoch": 0.11370068734476983,
+      "grad_norm": 1.1210070848464966,
+      "learning_rate": 4.4314569145759353e-05,
+      "loss": 0.6612,
+      "step": 7874
+    },
+    {
+      "epoch": 0.11414832784612719,
+      "grad_norm": 1.2345409393310547,
+      "learning_rate": 4.42611386459262e-05,
+      "loss": 0.65,
+      "step": 7905
+    },
+    {
+      "epoch": 0.11459596834748455,
+      "grad_norm": 1.077025294303894,
+      "learning_rate": 4.420749078676133e-05,
+      "loss": 0.6595,
+      "step": 7936
+    },
+    {
+      "epoch": 0.1150436088488419,
+      "grad_norm": 1.2079277038574219,
+      "learning_rate": 4.4153626173673516e-05,
+      "loss": 0.6442,
+      "step": 7967
+    },
+    {
+      "epoch": 0.11549124935019928,
+      "grad_norm": 1.6710035800933838,
+      "learning_rate": 4.409954541451762e-05,
+      "loss": 0.663,
+      "step": 7998
+    },
+    {
+      "epoch": 0.11593888985155663,
+      "grad_norm": 1.3124401569366455,
+      "learning_rate": 4.404524911958764e-05,
+      "loss": 0.6512,
+      "step": 8029
+    },
+    {
+      "epoch": 0.11638653035291399,
+      "grad_norm": 1.644904375076294,
+      "learning_rate": 4.399073790160989e-05,
+      "loss": 0.6587,
+      "step": 8060
+    },
+    {
+      "epoch": 0.11683417085427136,
+      "grad_norm": 1.181624174118042,
+      "learning_rate": 4.393601237573607e-05,
+      "loss": 0.653,
+      "step": 8091
+    },
+    {
+      "epoch": 0.11728181135562872,
+      "grad_norm": 1.4587918519973755,
+      "learning_rate": 4.388107315953628e-05,
+      "loss": 0.675,
+      "step": 8122
+    },
+    {
+      "epoch": 0.11772945185698608,
+      "grad_norm": 1.2147635221481323,
+      "learning_rate": 4.382592087299212e-05,
+      "loss": 0.6521,
+      "step": 8153
+    },
+    {
+      "epoch": 0.11817709235834344,
+      "grad_norm": 1.0448981523513794,
+      "learning_rate": 4.377055613848964e-05,
+      "loss": 0.6541,
+      "step": 8184
+    },
+    {
+      "epoch": 0.11862473285970081,
+      "grad_norm": 1.4482290744781494,
+      "learning_rate": 4.3714979580812355e-05,
+      "loss": 0.6563,
+      "step": 8215
+    },
+    {
+      "epoch": 0.11907237336105816,
+      "grad_norm": 1.1621575355529785,
+      "learning_rate": 4.365919182713416e-05,
+      "loss": 0.656,
+      "step": 8246
+    },
+    {
+      "epoch": 0.11952001386241552,
+      "grad_norm": 1.1643873453140259,
+      "learning_rate": 4.360319350701226e-05,
+      "loss": 0.6547,
+      "step": 8277
+    },
+    {
+      "epoch": 0.1199676543637729,
+      "grad_norm": 1.4016129970550537,
+      "learning_rate": 4.3546985252380115e-05,
+      "loss": 0.6582,
+      "step": 8308
+    },
+    {
+      "epoch": 0.12041529486513025,
+      "grad_norm": 1.4023685455322266,
+      "learning_rate": 4.349056769754021e-05,
+      "loss": 0.6621,
+      "step": 8339
+    },
+    {
+      "epoch": 0.12086293536648761,
+      "grad_norm": 1.3020285367965698,
+      "learning_rate": 4.3433941479156994e-05,
+      "loss": 0.6674,
+      "step": 8370
+    },
+    {
+      "epoch": 0.12131057586784497,
+      "grad_norm": 1.2162435054779053,
+      "learning_rate": 4.3377107236249647e-05,
+      "loss": 0.6614,
+      "step": 8401
+    },
+    {
+      "epoch": 0.12175821636920234,
+      "grad_norm": 1.1956969499588013,
+      "learning_rate": 4.332006561018488e-05,
+      "loss": 0.6557,
+      "step": 8432
+    },
+    {
+      "epoch": 0.1222058568705597,
+      "grad_norm": 1.1723664999008179,
+      "learning_rate": 4.3262817244669683e-05,
+      "loss": 0.6633,
+      "step": 8463
+    },
+    {
+      "epoch": 0.12265349737191705,
+      "grad_norm": 1.113020658493042,
+      "learning_rate": 4.3205362785744083e-05,
+      "loss": 0.6577,
+      "step": 8494
+    },
+    {
+      "epoch": 0.12310113787327442,
+      "grad_norm": 1.2453004121780396,
+      "learning_rate": 4.314770288177384e-05,
+      "loss": 0.6544,
+      "step": 8525
+    },
+    {
+      "epoch": 0.12354877837463178,
+      "grad_norm": 1.1493890285491943,
+      "learning_rate": 4.308983818344313e-05,
+      "loss": 0.6533,
+      "step": 8556
+    },
+    {
+      "epoch": 0.12399641887598914,
+      "grad_norm": 1.4172496795654297,
+      "learning_rate": 4.3031769343747206e-05,
+      "loss": 0.6542,
+      "step": 8587
+    },
+    {
+      "epoch": 0.1244440593773465,
+      "grad_norm": 1.1840728521347046,
+      "learning_rate": 4.297349701798505e-05,
+      "loss": 0.6476,
+      "step": 8618
+    },
+    {
+      "epoch": 0.12489169987870387,
+      "grad_norm": 1.3720282316207886,
+      "learning_rate": 4.2915021863751916e-05,
+      "loss": 0.6446,
+      "step": 8649
+    },
+    {
+      "epoch": 0.12533934038006123,
+      "grad_norm": 1.1705596446990967,
+      "learning_rate": 4.285634454093198e-05,
+      "loss": 0.6537,
+      "step": 8680
+    },
+    {
+      "epoch": 0.1257869808814186,
+      "grad_norm": 1.0790083408355713,
+      "learning_rate": 4.279746571169086e-05,
+      "loss": 0.6543,
+      "step": 8711
+    },
+    {
+      "epoch": 0.12623462138277594,
+      "grad_norm": 1.1207470893859863,
+      "learning_rate": 4.2738386040468136e-05,
+      "loss": 0.6468,
+      "step": 8742
+    },
+    {
+      "epoch": 0.1266822618841333,
+      "grad_norm": 1.1123065948486328,
+      "learning_rate": 4.2679106193969866e-05,
+      "loss": 0.6596,
+      "step": 8773
+    },
+    {
+      "epoch": 0.12712990238549068,
+      "grad_norm": 1.1579636335372925,
+      "learning_rate": 4.261962684116106e-05,
+      "loss": 0.6458,
+      "step": 8804
+    },
+    {
+      "epoch": 0.12757754288684803,
+      "grad_norm": 1.3112802505493164,
+      "learning_rate": 4.2559948653258145e-05,
+      "loss": 0.6483,
+      "step": 8835
+    },
+    {
+      "epoch": 0.1280251833882054,
+      "grad_norm": 1.1104832887649536,
+      "learning_rate": 4.250007230372134e-05,
+      "loss": 0.645,
+      "step": 8866
+    },
+    {
+      "epoch": 0.12847282388956274,
+      "grad_norm": 1.0218713283538818,
+      "learning_rate": 4.2439998468247126e-05,
+      "loss": 0.6519,
+      "step": 8897
+    },
+    {
+      "epoch": 0.12892046439092011,
+      "grad_norm": 1.0053678750991821,
+      "learning_rate": 4.2379727824760566e-05,
+      "loss": 0.6468,
+      "step": 8928
+    },
+    {
+      "epoch": 0.12936810489227749,
+      "grad_norm": 1.410933017730713,
+      "learning_rate": 4.231926105340768e-05,
+      "loss": 0.6573,
+      "step": 8959
+    },
+    {
+      "epoch": 0.12981574539363483,
+      "grad_norm": 1.5001798868179321,
+      "learning_rate": 4.225859883654776e-05,
+      "loss": 0.6483,
+      "step": 8990
+    },
+    {
+      "epoch": 0.1302633858949922,
+      "grad_norm": 1.112316608428955,
+      "learning_rate": 4.219774185874569e-05,
+      "loss": 0.6483,
+      "step": 9021
+    },
+    {
+      "epoch": 0.13071102639634957,
+      "grad_norm": 1.527464747428894,
+      "learning_rate": 4.213669080676418e-05,
+      "loss": 0.6512,
+      "step": 9052
+    },
+    {
+      "epoch": 0.13115866689770692,
+      "grad_norm": 1.1075704097747803,
+      "learning_rate": 4.2075446369556056e-05,
+      "loss": 0.6577,
+      "step": 9083
+    },
+    {
+      "epoch": 0.1316063073990643,
+      "grad_norm": 0.9589399099349976,
+      "learning_rate": 4.201400923825648e-05,
+      "loss": 0.642,
+      "step": 9114
+    },
+    {
+      "epoch": 0.13205394790042166,
+      "grad_norm": 1.186531901359558,
+      "learning_rate": 4.195238010617511e-05,
+      "loss": 0.6553,
+      "step": 9145
+    },
+    {
+      "epoch": 0.132501588401779,
+      "grad_norm": 1.1176280975341797,
+      "learning_rate": 4.1890559668788344e-05,
+      "loss": 0.6483,
+      "step": 9176
+    },
+    {
+      "epoch": 0.13294922890313637,
+      "grad_norm": 1.4222681522369385,
+      "learning_rate": 4.1828548623731405e-05,
+      "loss": 0.6462,
+      "step": 9207
+    },
+    {
+      "epoch": 0.13339686940449375,
+      "grad_norm": 1.1606040000915527,
+      "learning_rate": 4.1766347670790506e-05,
+      "loss": 0.6514,
+      "step": 9238
+    },
+    {
+      "epoch": 0.1338445099058511,
+      "grad_norm": 1.313774585723877,
+      "learning_rate": 4.170395751189495e-05,
+      "loss": 0.6422,
+      "step": 9269
+    },
+    {
+      "epoch": 0.13429215040720846,
+      "grad_norm": 1.1994171142578125,
+      "learning_rate": 4.164137885110921e-05,
+      "loss": 0.6444,
+      "step": 9300
+    },
+    {
+      "epoch": 0.1347397909085658,
+      "grad_norm": 0.9376353025436401,
+      "learning_rate": 4.157861239462495e-05,
+      "loss": 0.6436,
+      "step": 9331
+    },
+    {
+      "epoch": 0.13518743140992318,
+      "grad_norm": 1.0350178480148315,
+      "learning_rate": 4.1515658850753114e-05,
+      "loss": 0.6447,
+      "step": 9362
+    },
+    {
+      "epoch": 0.13563507191128055,
+      "grad_norm": 1.3630082607269287,
+      "learning_rate": 4.145251892991588e-05,
+      "loss": 0.6427,
+      "step": 9393
+    },
+    {
+      "epoch": 0.1360827124126379,
+      "grad_norm": 1.0362364053726196,
+      "learning_rate": 4.138919334463868e-05,
+      "loss": 0.6443,
+      "step": 9424
+    },
+    {
+      "epoch": 0.13653035291399526,
+      "grad_norm": 1.1442211866378784,
+      "learning_rate": 4.1325682809542124e-05,
+      "loss": 0.6523,
+      "step": 9455
+    },
+    {
+      "epoch": 0.13697799341535263,
+      "grad_norm": 1.4196938276290894,
+      "learning_rate": 4.126198804133398e-05,
+      "loss": 0.6501,
+      "step": 9486
+    },
+    {
+      "epoch": 0.13742563391670998,
+      "grad_norm": 1.3853130340576172,
+      "learning_rate": 4.1198109758801055e-05,
+      "loss": 0.6431,
+      "step": 9517
+    },
+    {
+      "epoch": 0.13787327441806735,
+      "grad_norm": 1.0350273847579956,
+      "learning_rate": 4.113404868280107e-05,
+      "loss": 0.6436,
+      "step": 9548
+    },
+    {
+      "epoch": 0.13832091491942472,
+      "grad_norm": 1.0520857572555542,
+      "learning_rate": 4.106980553625457e-05,
+      "loss": 0.6436,
+      "step": 9579
+    },
+    {
+      "epoch": 0.13876855542078206,
+      "grad_norm": 1.127038836479187,
+      "learning_rate": 4.100538104413674e-05,
+      "loss": 0.639,
+      "step": 9610
+    },
+    {
+      "epoch": 0.13921619592213944,
+      "grad_norm": 1.1070880889892578,
+      "learning_rate": 4.09407759334692e-05,
+      "loss": 0.6366,
+      "step": 9641
+    },
+    {
+      "epoch": 0.1396638364234968,
+      "grad_norm": 1.3045605421066284,
+      "learning_rate": 4.087599093331186e-05,
+      "loss": 0.6496,
+      "step": 9672
+    },
+    {
+      "epoch": 0.14011147692485415,
+      "grad_norm": 1.234647512435913,
+      "learning_rate": 4.081102677475462e-05,
+      "loss": 0.6383,
+      "step": 9703
+    },
+    {
+      "epoch": 0.14055911742621152,
+      "grad_norm": 1.1154453754425049,
+      "learning_rate": 4.0745884190909194e-05,
+      "loss": 0.6454,
+      "step": 9734
+    },
+    {
+      "epoch": 0.14100675792756887,
+      "grad_norm": 1.2422186136245728,
+      "learning_rate": 4.0680563916900796e-05,
+      "loss": 0.6404,
+      "step": 9765
+    },
+    {
+      "epoch": 0.14145439842892624,
+      "grad_norm": 1.2128278017044067,
+      "learning_rate": 4.0615066689859815e-05,
+      "loss": 0.6376,
+      "step": 9796
+    },
+    {
+      "epoch": 0.1419020389302836,
+      "grad_norm": 1.3140804767608643,
+      "learning_rate": 4.0549393248913584e-05,
+      "loss": 0.6316,
+      "step": 9827
+    },
+    {
+      "epoch": 0.14234967943164095,
+      "grad_norm": 1.9198187589645386,
+      "learning_rate": 4.048354433517794e-05,
+      "loss": 0.6383,
+      "step": 9858
+    },
+    {
+      "epoch": 0.14279731993299832,
+      "grad_norm": 1.144679307937622,
+      "learning_rate": 4.0417520691748916e-05,
+      "loss": 0.6383,
+      "step": 9889
+    },
+    {
+      "epoch": 0.1432449604343557,
+      "grad_norm": 1.1679338216781616,
+      "learning_rate": 4.035132306369438e-05,
+      "loss": 0.6414,
+      "step": 9920
+    },
+    {
+      "epoch": 0.14369260093571304,
+      "grad_norm": 0.9563717246055603,
+      "learning_rate": 4.028495219804555e-05,
+      "loss": 0.6327,
+      "step": 9951
+    },
+    {
+      "epoch": 0.1441402414370704,
+      "grad_norm": 1.277036428451538,
+      "learning_rate": 4.021840884378864e-05,
+      "loss": 0.6365,
+      "step": 9982
+    },
+    {
+      "epoch": 0.14458788193842778,
+      "grad_norm": 0.9835182428359985,
+      "learning_rate": 4.015169375185633e-05,
+      "loss": 0.638,
+      "step": 10013
+    },
+    {
+      "epoch": 0.14503552243978513,
+      "grad_norm": 1.090118646621704,
+      "learning_rate": 4.0084807675119396e-05,
+      "loss": 0.6437,
+      "step": 10044
+    },
+    {
+      "epoch": 0.1454831629411425,
+      "grad_norm": 1.1823488473892212,
+      "learning_rate": 4.0017751368378106e-05,
+      "loss": 0.6326,
+      "step": 10075
+    },
+    {
+      "epoch": 0.14593080344249987,
+      "grad_norm": 1.070318341255188,
+      "learning_rate": 3.995052558835377e-05,
+      "loss": 0.6362,
+      "step": 10106
+    },
+    {
+      "epoch": 0.1463784439438572,
+      "grad_norm": 1.2451491355895996,
+      "learning_rate": 3.988313109368017e-05,
+      "loss": 0.6388,
+      "step": 10137
+    },
+    {
+      "epoch": 0.14682608444521458,
+      "grad_norm": 1.2417365312576294,
+      "learning_rate": 3.981556864489504e-05,
+      "loss": 0.6309,
+      "step": 10168
+    },
+    {
+      "epoch": 0.14727372494657193,
+      "grad_norm": 1.251518726348877,
+      "learning_rate": 3.974783900443142e-05,
+      "loss": 0.6365,
+      "step": 10199
+    },
+    {
+      "epoch": 0.1477213654479293,
+      "grad_norm": 1.359750747680664,
+      "learning_rate": 3.9679942936609095e-05,
+      "loss": 0.6386,
+      "step": 10230
+    },
+    {
+      "epoch": 0.14816900594928667,
+      "grad_norm": 1.1073262691497803,
+      "learning_rate": 3.961188120762596e-05,
+      "loss": 0.635,
+      "step": 10261
+    },
+    {
+      "epoch": 0.148616646450644,
+      "grad_norm": 0.9850608706474304,
+      "learning_rate": 3.954365458554938e-05,
+      "loss": 0.6389,
+      "step": 10292
+    },
+    {
+      "epoch": 0.14906428695200138,
+      "grad_norm": 1.2984429597854614,
+      "learning_rate": 3.947526384030751e-05,
+      "loss": 0.6317,
+      "step": 10323
+    },
+    {
+      "epoch": 0.14951192745335876,
+      "grad_norm": 1.1622575521469116,
+      "learning_rate": 3.9406709743680624e-05,
+      "loss": 0.6393,
+      "step": 10354
+    },
+    {
+      "epoch": 0.1499595679547161,
+      "grad_norm": 1.0856871604919434,
+      "learning_rate": 3.9337993069292366e-05,
+      "loss": 0.6351,
+      "step": 10385
+    },
+    {
+      "epoch": 0.15040720845607347,
+      "grad_norm": 1.0153882503509521,
+      "learning_rate": 3.926911459260109e-05,
+      "loss": 0.6282,
+      "step": 10416
+    },
+    {
+      "epoch": 0.15085484895743084,
+      "grad_norm": 1.2039254903793335,
+      "learning_rate": 3.920007509089102e-05,
+      "loss": 0.6365,
+      "step": 10447
+    },
+    {
+      "epoch": 0.1513024894587882,
+      "grad_norm": 1.1179555654525757,
+      "learning_rate": 3.913087534326357e-05,
+      "loss": 0.6311,
+      "step": 10478
+    },
+    {
+      "epoch": 0.15175012996014556,
+      "grad_norm": 1.090903639793396,
+      "learning_rate": 3.9061516130628475e-05,
+      "loss": 0.6401,
+      "step": 10509
+    },
+    {
+      "epoch": 0.15219777046150293,
+      "grad_norm": 0.9228240251541138,
+      "learning_rate": 3.8991998235695025e-05,
+      "loss": 0.6323,
+      "step": 10540
+    },
+    {
+      "epoch": 0.15264541096286027,
+      "grad_norm": 1.0772743225097656,
+      "learning_rate": 3.8922322442963224e-05,
+      "loss": 0.637,
+      "step": 10571
+    },
+    {
+      "epoch": 0.15309305146421764,
+      "grad_norm": 1.0854771137237549,
+      "learning_rate": 3.885248953871491e-05,
+      "loss": 0.6375,
+      "step": 10602
+    },
+    {
+      "epoch": 0.153540691965575,
+      "grad_norm": 1.3902987241744995,
+      "learning_rate": 3.8782500311004915e-05,
+      "loss": 0.6406,
+      "step": 10633
+    },
+    {
+      "epoch": 0.15398833246693236,
+      "grad_norm": 1.180351734161377,
+      "learning_rate": 3.871235554965218e-05,
+      "loss": 0.626,
+      "step": 10664
+    },
+    {
+      "epoch": 0.15443597296828973,
+      "grad_norm": 1.1136449575424194,
+      "learning_rate": 3.864205604623078e-05,
+      "loss": 0.6395,
+      "step": 10695
+    },
+    {
+      "epoch": 0.15488361346964707,
+      "grad_norm": 1.1770708560943604,
+      "learning_rate": 3.857160259406107e-05,
+      "loss": 0.633,
+      "step": 10726
+    },
+    {
+      "epoch": 0.15533125397100445,
+      "grad_norm": 1.1615066528320312,
+      "learning_rate": 3.8500995988200674e-05,
+      "loss": 0.632,
+      "step": 10757
+    },
+    {
+      "epoch": 0.15577889447236182,
+      "grad_norm": 1.2898380756378174,
+      "learning_rate": 3.843023702543556e-05,
+      "loss": 0.6332,
+      "step": 10788
+    },
+    {
+      "epoch": 0.15622653497371916,
+      "grad_norm": 1.0051672458648682,
+      "learning_rate": 3.8359326504270984e-05,
+      "loss": 0.6353,
+      "step": 10819
+    },
+    {
+      "epoch": 0.15667417547507653,
+      "grad_norm": 0.9514272212982178,
+      "learning_rate": 3.828826522492255e-05,
+      "loss": 0.6383,
+      "step": 10850
+    },
+    {
+      "epoch": 0.1571218159764339,
+      "grad_norm": 1.2570873498916626,
+      "learning_rate": 3.821705398930713e-05,
+      "loss": 0.6308,
+      "step": 10881
+    },
+    {
+      "epoch": 0.15756945647779125,
+      "grad_norm": 0.992323637008667,
+      "learning_rate": 3.814569360103385e-05,
+      "loss": 0.6303,
+      "step": 10912
+    },
+    {
+      "epoch": 0.15801709697914862,
+      "grad_norm": 1.255265474319458,
+      "learning_rate": 3.807418486539499e-05,
+      "loss": 0.6349,
+      "step": 10943
+    },
+    {
+      "epoch": 0.158464737480506,
+      "grad_norm": 1.1066702604293823,
+      "learning_rate": 3.80025285893569e-05,
+      "loss": 0.6317,
+      "step": 10974
+    },
+    {
+      "epoch": 0.15891237798186333,
+      "grad_norm": 1.178690791130066,
+      "learning_rate": 3.793072558155093e-05,
+      "loss": 0.639,
+      "step": 11005
+    },
+    {
+      "epoch": 0.1593600184832207,
+      "grad_norm": 1.0850341320037842,
+      "learning_rate": 3.785877665226426e-05,
+      "loss": 0.6375,
+      "step": 11036
+    },
+    {
+      "epoch": 0.15980765898457805,
+      "grad_norm": 1.1378651857376099,
+      "learning_rate": 3.778668261343079e-05,
+      "loss": 0.6287,
+      "step": 11067
+    },
+    {
+      "epoch": 0.16025529948593542,
+      "grad_norm": 1.07688570022583,
+      "learning_rate": 3.771444427862192e-05,
+      "loss": 0.6261,
+      "step": 11098
+    },
+    {
+      "epoch": 0.1607029399872928,
+      "grad_norm": 1.108269453048706,
+      "learning_rate": 3.7642062463037465e-05,
+      "loss": 0.6352,
+      "step": 11129
+    },
+    {
+      "epoch": 0.16115058048865014,
+      "grad_norm": 1.2582095861434937,
+      "learning_rate": 3.7569537983496373e-05,
+      "loss": 0.6312,
+      "step": 11160
+    },
+    {
+      "epoch": 0.1615982209900075,
+      "grad_norm": 0.9823578000068665,
+      "learning_rate": 3.749687165842753e-05,
+      "loss": 0.6253,
+      "step": 11191
+    },
+    {
+      "epoch": 0.16204586149136488,
+      "grad_norm": 1.3922805786132812,
+      "learning_rate": 3.7424064307860536e-05,
+      "loss": 0.6279,
+      "step": 11222
+    },
+    {
+      "epoch": 0.16249350199272222,
+      "grad_norm": 1.2210962772369385,
+      "learning_rate": 3.735111675341645e-05,
+      "loss": 0.6357,
+      "step": 11253
+    },
+    {
+      "epoch": 0.1629411424940796,
+      "grad_norm": 1.0463316440582275,
+      "learning_rate": 3.7278029818298524e-05,
+      "loss": 0.6332,
+      "step": 11284
+    },
+    {
+      "epoch": 0.16338878299543697,
+      "grad_norm": 1.165583848953247,
+      "learning_rate": 3.720480432728287e-05,
+      "loss": 0.627,
+      "step": 11315
+    },
+    {
+      "epoch": 0.1638364234967943,
+      "grad_norm": 1.0995306968688965,
+      "learning_rate": 3.71314411067092e-05,
+      "loss": 0.6283,
+      "step": 11346
+    },
+    {
+      "epoch": 0.16428406399815168,
+      "grad_norm": 1.0279158353805542,
+      "learning_rate": 3.70579409844715e-05,
+      "loss": 0.6287,
+      "step": 11377
+    },
+    {
+      "epoch": 0.16473170449950905,
+      "grad_norm": 1.51092529296875,
+      "learning_rate": 3.698430479000865e-05,
+      "loss": 0.6261,
+      "step": 11408
+    },
+    {
+      "epoch": 0.1651793450008664,
+      "grad_norm": 1.020936369895935,
+      "learning_rate": 3.691053335429509e-05,
+      "loss": 0.6327,
+      "step": 11439
+    },
+    {
+      "epoch": 0.16562698550222377,
+      "grad_norm": 1.0198683738708496,
+      "learning_rate": 3.683662750983147e-05,
+      "loss": 0.6422,
+      "step": 11470
+    },
+    {
+      "epoch": 0.1660746260035811,
+      "grad_norm": 1.2650995254516602,
+      "learning_rate": 3.676258809063518e-05,
+      "loss": 0.6354,
+      "step": 11501
+    },
+    {
+      "epoch": 0.16652226650493848,
+      "grad_norm": 1.1653568744659424,
+      "learning_rate": 3.6688415932231004e-05,
+      "loss": 0.6325,
+      "step": 11532
+    },
+    {
+      "epoch": 0.16696990700629585,
+      "grad_norm": 1.1461430788040161,
+      "learning_rate": 3.661411187164166e-05,
+      "loss": 0.6251,
+      "step": 11563
+    },
+    {
+      "epoch": 0.1674175475076532,
+      "grad_norm": 1.2535974979400635,
+      "learning_rate": 3.65396767473784e-05,
+      "loss": 0.6245,
+      "step": 11594
+    },
+    {
+      "epoch": 0.16786518800901057,
+      "grad_norm": 1.115191102027893,
+      "learning_rate": 3.6465111399431465e-05,
+      "loss": 0.6294,
+      "step": 11625
+    },
+    {
+      "epoch": 0.16831282851036794,
+      "grad_norm": 1.0482964515686035,
+      "learning_rate": 3.6390416669260674e-05,
+      "loss": 0.6247,
+      "step": 11656
+    },
+    {
+      "epoch": 0.16876046901172528,
+      "grad_norm": 1.1431951522827148,
+      "learning_rate": 3.63155933997859e-05,
+      "loss": 0.63,
+      "step": 11687
+    },
+    {
+      "epoch": 0.16920810951308266,
+      "grad_norm": 1.0254175662994385,
+      "learning_rate": 3.624064243537758e-05,
+      "loss": 0.6212,
+      "step": 11718
+    },
+    {
+      "epoch": 0.16965575001444003,
+      "grad_norm": 0.9481080174446106,
+      "learning_rate": 3.616556462184716e-05,
+      "loss": 0.6278,
+      "step": 11749
+    },
+    {
+      "epoch": 0.17010339051579737,
+      "grad_norm": 1.118394374847412,
+      "learning_rate": 3.609036080643755e-05,
+      "loss": 0.6244,
+      "step": 11780
+    },
+    {
+      "epoch": 0.17055103101715474,
+      "grad_norm": 1.1592167615890503,
+      "learning_rate": 3.60150318378136e-05,
+      "loss": 0.621,
+      "step": 11811
+    },
+    {
+      "epoch": 0.1709986715185121,
+      "grad_norm": 0.9984686374664307,
+      "learning_rate": 3.5939578566052465e-05,
+      "loss": 0.6319,
+      "step": 11842
+    },
+    {
+      "epoch": 0.17144631201986946,
+      "grad_norm": 1.0091164112091064,
+      "learning_rate": 3.586400184263408e-05,
+      "loss": 0.6345,
+      "step": 11873
+    },
+    {
+      "epoch": 0.17189395252122683,
+      "grad_norm": 1.0355888605117798,
+      "learning_rate": 3.578830252043148e-05,
+      "loss": 0.6171,
+      "step": 11904
+    },
+    {
+      "epoch": 0.17234159302258417,
+      "grad_norm": 1.1437592506408691,
+      "learning_rate": 3.571248145370125e-05,
+      "loss": 0.6201,
+      "step": 11935
+    },
+    {
+      "epoch": 0.17278923352394154,
+      "grad_norm": 0.9440962672233582,
+      "learning_rate": 3.5636539498073794e-05,
+      "loss": 0.6236,
+      "step": 11966
+    },
+    {
+      "epoch": 0.17323687402529891,
+      "grad_norm": 0.9761082530021667,
+      "learning_rate": 3.556047751054378e-05,
+      "loss": 0.6291,
+      "step": 11997
+    },
+    {
+      "epoch": 0.17368451452665626,
+      "grad_norm": 1.1858127117156982,
+      "learning_rate": 3.548429634946039e-05,
+      "loss": 0.6299,
+      "step": 12028
+    },
+    {
+      "epoch": 0.17413215502801363,
+      "grad_norm": 1.0180195569992065,
+      "learning_rate": 3.540799687451768e-05,
+      "loss": 0.6227,
+      "step": 12059
+    },
+    {
+      "epoch": 0.174579795529371,
+      "grad_norm": 0.9683852195739746,
+      "learning_rate": 3.533157994674485e-05,
+      "loss": 0.626,
+      "step": 12090
+    },
+    {
+      "epoch": 0.17502743603072834,
+      "grad_norm": 1.0338289737701416,
+      "learning_rate": 3.5255046428496546e-05,
+      "loss": 0.6377,
+      "step": 12121
+    },
+    {
+      "epoch": 0.17547507653208572,
+      "grad_norm": 1.1238298416137695,
+      "learning_rate": 3.517839718344311e-05,
+      "loss": 0.6338,
+      "step": 12152
+    },
+    {
+      "epoch": 0.1759227170334431,
+      "grad_norm": 1.0541973114013672,
+      "learning_rate": 3.510163307656086e-05,
+      "loss": 0.6222,
+      "step": 12183
+    },
+    {
+      "epoch": 0.17637035753480043,
+      "grad_norm": 1.1677592992782593,
+      "learning_rate": 3.5024754974122324e-05,
+      "loss": 0.6256,
+      "step": 12214
+    },
+    {
+      "epoch": 0.1768179980361578,
+      "grad_norm": 1.0191985368728638,
+      "learning_rate": 3.494776374368643e-05,
+      "loss": 0.6301,
+      "step": 12245
+    },
+    {
+      "epoch": 0.17726563853751517,
+      "grad_norm": 1.1535918712615967,
+      "learning_rate": 3.4870660254088724e-05,
+      "loss": 0.6253,
+      "step": 12276
+    },
+    {
+      "epoch": 0.17771327903887252,
+      "grad_norm": 1.0887985229492188,
+      "learning_rate": 3.479344537543164e-05,
+      "loss": 0.6335,
+      "step": 12307
+    },
+    {
+      "epoch": 0.1781609195402299,
+      "grad_norm": 1.010688066482544,
+      "learning_rate": 3.4716119979074565e-05,
+      "loss": 0.6251,
+      "step": 12338
+    },
+    {
+      "epoch": 0.17860856004158723,
+      "grad_norm": 0.9745127558708191,
+      "learning_rate": 3.463868493762412e-05,
+      "loss": 0.6241,
+      "step": 12369
+    },
+    {
+      "epoch": 0.1790562005429446,
+      "grad_norm": 1.0414716005325317,
+      "learning_rate": 3.456114112492418e-05,
+      "loss": 0.6237,
+      "step": 12400
+    },
+    {
+      "epoch": 0.17950384104430198,
+      "grad_norm": 1.0457465648651123,
+      "learning_rate": 3.4483489416046164e-05,
+      "loss": 0.6258,
+      "step": 12431
+    },
+    {
+      "epoch": 0.17995148154565932,
+      "grad_norm": 1.0389049053192139,
+      "learning_rate": 3.440573068727905e-05,
+      "loss": 0.6262,
+      "step": 12462
+    },
+    {
+      "epoch": 0.1803991220470167,
+      "grad_norm": 1.255600094795227,
+      "learning_rate": 3.4327865816119495e-05,
+      "loss": 0.6305,
+      "step": 12493
+    },
+    {
+      "epoch": 0.18084676254837406,
+      "grad_norm": 1.0340358018875122,
+      "learning_rate": 3.4249895681262025e-05,
+      "loss": 0.6212,
+      "step": 12524
+    },
+    {
+      "epoch": 0.1812944030497314,
+      "grad_norm": 1.0317034721374512,
+      "learning_rate": 3.417182116258899e-05,
+      "loss": 0.6279,
+      "step": 12555
+    },
+    {
+      "epoch": 0.18174204355108878,
+      "grad_norm": 1.1320221424102783,
+      "learning_rate": 3.409364314116074e-05,
+      "loss": 0.631,
+      "step": 12586
+    },
+    {
+      "epoch": 0.18218968405244615,
+      "grad_norm": 0.9674787521362305,
+      "learning_rate": 3.401536249920559e-05,
+      "loss": 0.627,
+      "step": 12617
+    },
+    {
+      "epoch": 0.1826373245538035,
+      "grad_norm": 0.9329623579978943,
+      "learning_rate": 3.393698012010998e-05,
+      "loss": 0.6244,
+      "step": 12648
+    },
+    {
+      "epoch": 0.18308496505516086,
+      "grad_norm": 1.2081501483917236,
+      "learning_rate": 3.385849688840839e-05,
+      "loss": 0.6295,
+      "step": 12679
+    },
+    {
+      "epoch": 0.18353260555651824,
+      "grad_norm": 0.9842090010643005,
+      "learning_rate": 3.3779913689773414e-05,
+      "loss": 0.6276,
+      "step": 12710
+    },
+    {
+      "epoch": 0.18398024605787558,
+      "grad_norm": 1.1417752504348755,
+      "learning_rate": 3.370123141100578e-05,
+      "loss": 0.6266,
+      "step": 12741
+    },
+    {
+      "epoch": 0.18442788655923295,
+      "grad_norm": 0.9693592190742493,
+      "learning_rate": 3.3622450940024305e-05,
+      "loss": 0.6245,
+      "step": 12772
+    },
+    {
+      "epoch": 0.1848755270605903,
+      "grad_norm": 1.1920111179351807,
+      "learning_rate": 3.35435731658559e-05,
+      "loss": 0.6227,
+      "step": 12803
+    },
+    {
+      "epoch": 0.18532316756194767,
+      "grad_norm": 0.9865401387214661,
+      "learning_rate": 3.346459897862552e-05,
+      "loss": 0.6287,
+      "step": 12834
+    },
+    {
+      "epoch": 0.18577080806330504,
+      "grad_norm": 0.9544184803962708,
+      "learning_rate": 3.338552926954613e-05,
+      "loss": 0.6236,
+      "step": 12865
+    },
+    {
+      "epoch": 0.18621844856466238,
+      "grad_norm": 1.0202548503875732,
+      "learning_rate": 3.330636493090868e-05,
+      "loss": 0.6269,
+      "step": 12896
+    },
+    {
+      "epoch": 0.18666608906601975,
+      "grad_norm": 1.1385433673858643,
+      "learning_rate": 3.322710685607193e-05,
+      "loss": 0.6385,
+      "step": 12927
+    },
+    {
+      "epoch": 0.18711372956737712,
+      "grad_norm": 1.0102901458740234,
+      "learning_rate": 3.314775593945251e-05,
+      "loss": 0.6241,
+      "step": 12958
+    },
+    {
+      "epoch": 0.18756137006873447,
+      "grad_norm": 0.9830989241600037,
+      "learning_rate": 3.3068313076514714e-05,
+      "loss": 0.6243,
+      "step": 12989
+    },
+    {
+      "epoch": 0.18800901057009184,
+      "grad_norm": 1.0044376850128174,
+      "learning_rate": 3.298877916376047e-05,
+      "loss": 0.619,
+      "step": 13020
+    },
+    {
+      "epoch": 0.1884566510714492,
+      "grad_norm": 1.0714712142944336,
+      "learning_rate": 3.290915509871915e-05,
+      "loss": 0.6243,
+      "step": 13051
+    },
+    {
+      "epoch": 0.18890429157280655,
+      "grad_norm": 0.9379229545593262,
+      "learning_rate": 3.282944177993753e-05,
+      "loss": 0.6216,
+      "step": 13082
+    },
+    {
+      "epoch": 0.18935193207416393,
+      "grad_norm": 1.2717514038085938,
+      "learning_rate": 3.274964010696957e-05,
+      "loss": 0.6206,
+      "step": 13113
+    },
+    {
+      "epoch": 0.1897995725755213,
+      "grad_norm": 1.1147576570510864,
+      "learning_rate": 3.266975098036629e-05,
+      "loss": 0.6234,
+      "step": 13144
+    },
+    {
+      "epoch": 0.19024721307687864,
+      "grad_norm": 0.9994730949401855,
+      "learning_rate": 3.258977530166562e-05,
+      "loss": 0.6146,
+      "step": 13175
+    },
+    {
+      "epoch": 0.190694853578236,
+      "grad_norm": 1.195367693901062,
+      "learning_rate": 3.250971397338227e-05,
+      "loss": 0.624,
+      "step": 13206
+    },
+    {
+      "epoch": 0.19114249407959336,
+      "grad_norm": 1.0008747577667236,
+      "learning_rate": 3.2429567898997404e-05,
+      "loss": 0.6182,
+      "step": 13237
+    },
+    {
+      "epoch": 0.19159013458095073,
+      "grad_norm": 1.3223299980163574,
+      "learning_rate": 3.234933798294859e-05,
+      "loss": 0.6193,
+      "step": 13268
+    },
+    {
+      "epoch": 0.1920377750823081,
+      "grad_norm": 1.1946437358856201,
+      "learning_rate": 3.2269025130619535e-05,
+      "loss": 0.6201,
+      "step": 13299
+    },
+    {
+      "epoch": 0.19248541558366544,
+      "grad_norm": 1.1597986221313477,
+      "learning_rate": 3.218863024832985e-05,
+      "loss": 0.6212,
+      "step": 13330
+    },
+    {
+      "epoch": 0.1929330560850228,
+      "grad_norm": 0.9518936276435852,
+      "learning_rate": 3.2108154243324864e-05,
+      "loss": 0.6154,
+      "step": 13361
+    },
+    {
+      "epoch": 0.19338069658638019,
+      "grad_norm": 0.890487790107727,
+      "learning_rate": 3.2027598023765345e-05,
+      "loss": 0.6203,
+      "step": 13392
+    },
+    {
+      "epoch": 0.19382833708773753,
+      "grad_norm": 0.9918534755706787,
+      "learning_rate": 3.194696249871729e-05,
+      "loss": 0.6319,
+      "step": 13423
+    },
+    {
+      "epoch": 0.1942759775890949,
+      "grad_norm": 1.1954073905944824,
+      "learning_rate": 3.186624857814164e-05,
+      "loss": 0.619,
+      "step": 13454
+    },
+    {
+      "epoch": 0.19472361809045227,
+      "grad_norm": 1.1521157026290894,
+      "learning_rate": 3.178545717288401e-05,
+      "loss": 0.6326,
+      "step": 13485
+    },
+    {
+      "epoch": 0.19517125859180962,
+      "grad_norm": 1.0131208896636963,
+      "learning_rate": 3.170458919466444e-05,
+      "loss": 0.6234,
+      "step": 13516
+    },
+    {
+      "epoch": 0.195618899093167,
+      "grad_norm": 1.0429494380950928,
+      "learning_rate": 3.1623645556067063e-05,
+      "loss": 0.6146,
+      "step": 13547
+    },
+    {
+      "epoch": 0.19606653959452436,
+      "grad_norm": 0.9586461782455444,
+      "learning_rate": 3.154262717052985e-05,
+      "loss": 0.6192,
+      "step": 13578
+    },
+    {
+      "epoch": 0.1965141800958817,
+      "grad_norm": 0.9385515451431274,
+      "learning_rate": 3.146153495233426e-05,
+      "loss": 0.6186,
+      "step": 13609
+    },
+    {
+      "epoch": 0.19696182059723907,
+      "grad_norm": 0.9109722375869751,
+      "learning_rate": 3.1380369816594944e-05,
+      "loss": 0.6223,
+      "step": 13640
+    },
+    {
+      "epoch": 0.19740946109859642,
+      "grad_norm": 1.0564444065093994,
+      "learning_rate": 3.129913267924946e-05,
+      "loss": 0.6235,
+      "step": 13671
+    },
+    {
+      "epoch": 0.1978571015999538,
+      "grad_norm": 1.1656286716461182,
+      "learning_rate": 3.121782445704782e-05,
+      "loss": 0.6176,
+      "step": 13702
+    },
+    {
+      "epoch": 0.19830474210131116,
+      "grad_norm": 1.1301069259643555,
+      "learning_rate": 3.11364460675423e-05,
+      "loss": 0.6253,
+      "step": 13733
+    },
+    {
+      "epoch": 0.1987523826026685,
+      "grad_norm": 0.9939395785331726,
+      "learning_rate": 3.1054998429076934e-05,
+      "loss": 0.6223,
+      "step": 13764
+    },
+    {
+      "epoch": 0.19920002310402588,
+      "grad_norm": 1.2881885766983032,
+      "learning_rate": 3.097348246077728e-05,
+      "loss": 0.6177,
+      "step": 13795
+    },
+    {
+      "epoch": 0.19964766360538325,
+      "grad_norm": 1.1002579927444458,
+      "learning_rate": 3.0891899082539924e-05,
+      "loss": 0.6139,
+      "step": 13826
+    },
+    {
+      "epoch": 0.2000953041067406,
+      "grad_norm": 1.045394778251648,
+      "learning_rate": 3.0810249215022233e-05,
+      "loss": 0.6192,
+      "step": 13857
+    },
+    {
+      "epoch": 0.20054294460809796,
+      "grad_norm": 0.9559116959571838,
+      "learning_rate": 3.0728533779631865e-05,
+      "loss": 0.6155,
+      "step": 13888
+    },
+    {
+      "epoch": 0.20099058510945533,
+      "grad_norm": 0.9250887036323547,
+      "learning_rate": 3.064675369851637e-05,
+      "loss": 0.6235,
+      "step": 13919
+    },
+    {
+      "epoch": 0.20143822561081268,
+      "grad_norm": 1.0655368566513062,
+      "learning_rate": 3.056490989455289e-05,
+      "loss": 0.628,
+      "step": 13950
+    },
+    {
+      "epoch": 0.20188586611217005,
+      "grad_norm": 1.07636559009552,
+      "learning_rate": 3.0483003291337596e-05,
+      "loss": 0.6244,
+      "step": 13981
+    },
+    {
+      "epoch": 0.20233350661352742,
+      "grad_norm": 1.050580620765686,
+      "learning_rate": 3.040103481317539e-05,
+      "loss": 0.6222,
+      "step": 14012
+    },
+    {
+      "epoch": 0.20278114711488476,
+      "grad_norm": 1.3754404783248901,
+      "learning_rate": 3.03190053850694e-05,
+      "loss": 0.6151,
+      "step": 14043
+    },
+    {
+      "epoch": 0.20322878761624213,
+      "grad_norm": 1.0527547597885132,
+      "learning_rate": 3.0236915932710573e-05,
+      "loss": 0.6153,
+      "step": 14074
+    },
+    {
+      "epoch": 0.20367642811759948,
+      "grad_norm": 0.9438226819038391,
+      "learning_rate": 3.0154767382467232e-05,
+      "loss": 0.618,
+      "step": 14105
+    },
+    {
+      "epoch": 0.20412406861895685,
+      "grad_norm": 1.0383126735687256,
+      "learning_rate": 3.0072560661374582e-05,
+      "loss": 0.6162,
+      "step": 14136
+    },
+    {
+      "epoch": 0.20457170912031422,
+      "grad_norm": 1.1412239074707031,
+      "learning_rate": 2.999029669712431e-05,
+      "loss": 0.6284,
+      "step": 14167
+    },
+    {
+      "epoch": 0.20501934962167156,
+      "grad_norm": 1.1064159870147705,
+      "learning_rate": 2.990797641805408e-05,
+      "loss": 0.6223,
+      "step": 14198
+    },
+    {
+      "epoch": 0.20546699012302894,
+      "grad_norm": 1.0044069290161133,
+      "learning_rate": 2.982560075313704e-05,
+      "loss": 0.6191,
+      "step": 14229
+    },
+    {
+      "epoch": 0.2059146306243863,
+      "grad_norm": 0.9315604567527771,
+      "learning_rate": 2.9743170631971368e-05,
+      "loss": 0.6207,
+      "step": 14260
+    },
+    {
+      "epoch": 0.20636227112574365,
+      "grad_norm": 0.941224217414856,
+      "learning_rate": 2.9660686984769792e-05,
+      "loss": 0.6207,
+      "step": 14291
+    },
+    {
+      "epoch": 0.20680991162710102,
+      "grad_norm": 1.1239089965820312,
+      "learning_rate": 2.9578150742349047e-05,
+      "loss": 0.6252,
+      "step": 14322
+    },
+    {
+      "epoch": 0.2072575521284584,
+      "grad_norm": 0.9484926462173462,
+      "learning_rate": 2.949556283611942e-05,
+      "loss": 0.6136,
+      "step": 14353
+    },
+    {
+      "epoch": 0.20770519262981574,
+      "grad_norm": 0.9437084197998047,
+      "learning_rate": 2.9412924198074206e-05,
+      "loss": 0.6154,
+      "step": 14384
+    },
+    {
+      "epoch": 0.2081528331311731,
+      "grad_norm": 0.9578093886375427,
+      "learning_rate": 2.9330235760779208e-05,
+      "loss": 0.6191,
+      "step": 14415
+    },
+    {
+      "epoch": 0.20860047363253048,
+      "grad_norm": 1.0657248497009277,
+      "learning_rate": 2.9247498457362188e-05,
+      "loss": 0.6178,
+      "step": 14446
+    },
+    {
+      "epoch": 0.20904811413388782,
+      "grad_norm": 0.853568434715271,
+      "learning_rate": 2.9164713221502373e-05,
+      "loss": 0.6152,
+      "step": 14477
+    },
+    {
+      "epoch": 0.2094957546352452,
+      "grad_norm": 1.0403015613555908,
+      "learning_rate": 2.9081880987419912e-05,
+      "loss": 0.6108,
+      "step": 14508
+    },
+    {
+      "epoch": 0.20994339513660254,
+      "grad_norm": 1.0344171524047852,
+      "learning_rate": 2.8999002689865296e-05,
+      "loss": 0.6155,
+      "step": 14539
+    },
+    {
+      "epoch": 0.2103910356379599,
+      "grad_norm": 1.0755060911178589,
+      "learning_rate": 2.8916079264108852e-05,
+      "loss": 0.6156,
+      "step": 14570
+    },
+    {
+      "epoch": 0.21083867613931728,
+      "grad_norm": 0.8636776208877563,
+      "learning_rate": 2.883311164593017e-05,
+      "loss": 0.6193,
+      "step": 14601
+    },
+    {
+      "epoch": 0.21128631664067463,
+      "grad_norm": 1.0264644622802734,
+      "learning_rate": 2.875010077160754e-05,
+      "loss": 0.6138,
+      "step": 14632
+    },
+    {
+      "epoch": 0.211733957142032,
+      "grad_norm": 1.2590196132659912,
+      "learning_rate": 2.866704757790741e-05,
+      "loss": 0.6202,
+      "step": 14663
+    },
+    {
+      "epoch": 0.21218159764338937,
+      "grad_norm": 1.1028645038604736,
+      "learning_rate": 2.858395300207376e-05,
+      "loss": 0.614,
+      "step": 14694
+    },
+    {
+      "epoch": 0.2126292381447467,
+      "grad_norm": 0.8904405236244202,
+      "learning_rate": 2.8500817981817607e-05,
+      "loss": 0.6152,
+      "step": 14725
+    },
+    {
+      "epoch": 0.21307687864610408,
+      "grad_norm": 0.9810163974761963,
+      "learning_rate": 2.8417643455306336e-05,
+      "loss": 0.6088,
+      "step": 14756
+    },
+    {
+      "epoch": 0.21352451914746146,
+      "grad_norm": 0.9837898015975952,
+      "learning_rate": 2.8334430361153185e-05,
+      "loss": 0.6129,
+      "step": 14787
+    },
+    {
+      "epoch": 0.2139721596488188,
+      "grad_norm": 0.987639844417572,
+      "learning_rate": 2.8251179638406612e-05,
+      "loss": 0.6081,
+      "step": 14818
+    },
+    {
+      "epoch": 0.21441980015017617,
+      "grad_norm": 1.1478586196899414,
+      "learning_rate": 2.8167892226539704e-05,
+      "loss": 0.6146,
+      "step": 14849
+    },
+    {
+      "epoch": 0.21486744065153354,
+      "grad_norm": 1.0885242223739624,
+      "learning_rate": 2.8084569065439588e-05,
+      "loss": 0.6183,
+      "step": 14880
+    },
+    {
+      "epoch": 0.21531508115289089,
+      "grad_norm": 0.9934699535369873,
+      "learning_rate": 2.8001211095396807e-05,
+      "loss": 0.6157,
+      "step": 14911
+    },
+    {
+      "epoch": 0.21576272165424826,
+      "grad_norm": 0.9285492300987244,
+      "learning_rate": 2.791781925709473e-05,
+      "loss": 0.6196,
+      "step": 14942
+    },
+    {
+      "epoch": 0.2162103621556056,
+      "grad_norm": 1.243133783340454,
+      "learning_rate": 2.7834394491598908e-05,
+      "loss": 0.6109,
+      "step": 14973
+    },
+    {
+      "epoch": 0.21665800265696297,
+      "grad_norm": 1.0712559223175049,
+      "learning_rate": 2.7750937740346485e-05,
+      "loss": 0.6268,
+      "step": 15004
+    },
+    {
+      "epoch": 0.21710564315832034,
+      "grad_norm": 1.0762903690338135,
+      "learning_rate": 2.7667449945135564e-05,
+      "loss": 0.6162,
+      "step": 15035
+    },
+    {
+      "epoch": 0.2175532836596777,
+      "grad_norm": 1.043479084968567,
+      "learning_rate": 2.7583932048114557e-05,
+      "loss": 0.6174,
+      "step": 15066
+    },
+    {
+      "epoch": 0.21800092416103506,
+      "grad_norm": 0.9906991720199585,
+      "learning_rate": 2.7500384991771587e-05,
+      "loss": 0.6153,
+      "step": 15097
+    },
+    {
+      "epoch": 0.21844856466239243,
+      "grad_norm": 0.8844815492630005,
+      "learning_rate": 2.7416809718923825e-05,
+      "loss": 0.6113,
+      "step": 15128
+    },
+    {
+      "epoch": 0.21889620516374977,
+      "grad_norm": 1.0258604288101196,
+      "learning_rate": 2.7333207172706864e-05,
+      "loss": 0.6111,
+      "step": 15159
+    },
+    {
+      "epoch": 0.21934384566510715,
+      "grad_norm": 0.8992047309875488,
+      "learning_rate": 2.7249578296564088e-05,
+      "loss": 0.6083,
+      "step": 15190
+    },
+    {
+      "epoch": 0.21979148616646452,
+      "grad_norm": 0.991061806678772,
+      "learning_rate": 2.7165924034235973e-05,
+      "loss": 0.6219,
+      "step": 15221
+    },
+    {
+      "epoch": 0.22023912666782186,
+      "grad_norm": 0.9700108766555786,
+      "learning_rate": 2.708224532974953e-05,
+      "loss": 0.6119,
+      "step": 15252
+    },
+    {
+      "epoch": 0.22068676716917923,
+      "grad_norm": 0.904680609703064,
+      "learning_rate": 2.6998543127407538e-05,
+      "loss": 0.6135,
+      "step": 15283
+    },
+    {
+      "epoch": 0.2211344076705366,
+      "grad_norm": 0.9015173316001892,
+      "learning_rate": 2.6914818371777988e-05,
+      "loss": 0.611,
+      "step": 15314
+    },
+    {
+      "epoch": 0.22158204817189395,
+      "grad_norm": 1.020070195198059,
+      "learning_rate": 2.6831072007683373e-05,
+      "loss": 0.617,
+      "step": 15345
+    },
+    {
+      "epoch": 0.22202968867325132,
+      "grad_norm": 1.0938650369644165,
+      "learning_rate": 2.6747304980190018e-05,
+      "loss": 0.6135,
+      "step": 15376
+    },
+    {
+      "epoch": 0.22247732917460866,
+      "grad_norm": 1.2179347276687622,
+      "learning_rate": 2.6663518234597453e-05,
+      "loss": 0.6108,
+      "step": 15407
+    },
+    {
+      "epoch": 0.22292496967596603,
+      "grad_norm": 0.9314635396003723,
+      "learning_rate": 2.6579712716427696e-05,
+      "loss": 0.6109,
+      "step": 15438
+    },
+    {
+      "epoch": 0.2233726101773234,
+      "grad_norm": 0.9413474798202515,
+      "learning_rate": 2.6495889371414652e-05,
+      "loss": 0.6114,
+      "step": 15469
+    },
+    {
+      "epoch": 0.22382025067868075,
+      "grad_norm": 1.0556674003601074,
+      "learning_rate": 2.6412049145493367e-05,
+      "loss": 0.6114,
+      "step": 15500
+    },
+    {
+      "epoch": 0.22426789118003812,
+      "grad_norm": 0.9029526114463806,
+      "learning_rate": 2.632819298478939e-05,
+      "loss": 0.6152,
+      "step": 15531
+    },
+    {
+      "epoch": 0.2247155316813955,
+      "grad_norm": 1.0554165840148926,
+      "learning_rate": 2.6244321835608105e-05,
+      "loss": 0.6077,
+      "step": 15562
+    },
+    {
+      "epoch": 0.22516317218275284,
+      "grad_norm": 0.9897674918174744,
+      "learning_rate": 2.6160436644424024e-05,
+      "loss": 0.6099,
+      "step": 15593
+    },
+    {
+      "epoch": 0.2256108126841102,
+      "grad_norm": 1.036055326461792,
+      "learning_rate": 2.6076538357870133e-05,
+      "loss": 0.6115,
+      "step": 15624
+    },
+    {
+      "epoch": 0.22605845318546758,
+      "grad_norm": 1.1050103902816772,
+      "learning_rate": 2.5992627922727196e-05,
+      "loss": 0.6132,
+      "step": 15655
+    },
+    {
+      "epoch": 0.22650609368682492,
+      "grad_norm": 1.0429555177688599,
+      "learning_rate": 2.5908706285913066e-05,
+      "loss": 0.6114,
+      "step": 15686
+    },
+    {
+      "epoch": 0.2269537341881823,
+      "grad_norm": 0.8952310681343079,
+      "learning_rate": 2.5824774394472008e-05,
+      "loss": 0.6155,
+      "step": 15717
+    },
+    {
+      "epoch": 0.22740137468953966,
+      "grad_norm": 0.9422932267189026,
+      "learning_rate": 2.5740833195563996e-05,
+      "loss": 0.6115,
+      "step": 15748
+    },
+    {
+      "epoch": 0.227849015190897,
+      "grad_norm": 0.8615415096282959,
+      "learning_rate": 2.5656883636454067e-05,
+      "loss": 0.6147,
+      "step": 15779
+    },
+    {
+      "epoch": 0.22829665569225438,
+      "grad_norm": 1.0953892469406128,
+      "learning_rate": 2.557292666450159e-05,
+      "loss": 0.6141,
+      "step": 15810
+    },
+    {
+      "epoch": 0.22874429619361172,
+      "grad_norm": 1.038050651550293,
+      "learning_rate": 2.5488963227149566e-05,
+      "loss": 0.6118,
+      "step": 15841
+    },
+    {
+      "epoch": 0.2291919366949691,
+      "grad_norm": 1.0005477666854858,
+      "learning_rate": 2.5404994271913983e-05,
+      "loss": 0.6071,
+      "step": 15872
+    },
+    {
+      "epoch": 0.22963957719632647,
+      "grad_norm": 1.1400154829025269,
+      "learning_rate": 2.5321020746373085e-05,
+      "loss": 0.6073,
+      "step": 15903
+    },
+    {
+      "epoch": 0.2300872176976838,
+      "grad_norm": 0.9465575218200684,
+      "learning_rate": 2.52370435981567e-05,
+      "loss": 0.6139,
+      "step": 15934
+    },
+    {
+      "epoch": 0.23053485819904118,
+      "grad_norm": 0.9524116516113281,
+      "learning_rate": 2.5153063774935533e-05,
+      "loss": 0.6112,
+      "step": 15965
+    },
+    {
+      "epoch": 0.23098249870039855,
+      "grad_norm": 1.0909959077835083,
+      "learning_rate": 2.506908222441045e-05,
+      "loss": 0.6062,
+      "step": 15996
+    },
+    {
+      "epoch": 0.2314301392017559,
+      "grad_norm": 0.9520925879478455,
+      "learning_rate": 2.498509989430187e-05,
+      "loss": 0.6066,
+      "step": 16027
+    },
+    {
+      "epoch": 0.23187777970311327,
+      "grad_norm": 0.9747080206871033,
+      "learning_rate": 2.4901117732338958e-05,
+      "loss": 0.6073,
+      "step": 16058
+    },
+    {
+      "epoch": 0.23232542020447064,
+      "grad_norm": 0.8820034265518188,
+      "learning_rate": 2.481713668624899e-05,
+      "loss": 0.6042,
+      "step": 16089
+    },
+    {
+      "epoch": 0.23277306070582798,
+      "grad_norm": 0.873534619808197,
+      "learning_rate": 2.4733157703746663e-05,
+      "loss": 0.6115,
+      "step": 16120
+    },
+    {
+      "epoch": 0.23322070120718535,
+      "grad_norm": 1.0529483556747437,
+      "learning_rate": 2.4649181732523392e-05,
+      "loss": 0.604,
+      "step": 16151
+    },
+    {
+      "epoch": 0.23366834170854273,
+      "grad_norm": 1.0236808061599731,
+      "learning_rate": 2.4565209720236582e-05,
+      "loss": 0.6109,
+      "step": 16182
+    },
+    {
+      "epoch": 0.23411598220990007,
+      "grad_norm": 0.926750898361206,
+      "learning_rate": 2.4481242614498975e-05,
+      "loss": 0.6103,
+      "step": 16213
+    },
+    {
+      "epoch": 0.23456362271125744,
+      "grad_norm": 0.9616347551345825,
+      "learning_rate": 2.439728136286796e-05,
+      "loss": 0.6115,
+      "step": 16244
+    },
+    {
+      "epoch": 0.23501126321261478,
+      "grad_norm": 0.9181815981864929,
+      "learning_rate": 2.4313326912834852e-05,
+      "loss": 0.6093,
+      "step": 16275
+    },
+    {
+      "epoch": 0.23545890371397216,
+      "grad_norm": 1.08785879611969,
+      "learning_rate": 2.4229380211814206e-05,
+      "loss": 0.6056,
+      "step": 16306
+    },
+    {
+      "epoch": 0.23590654421532953,
+      "grad_norm": 1.3456270694732666,
+      "learning_rate": 2.4145442207133124e-05,
+      "loss": 0.5999,
+      "step": 16337
+    },
+    {
+      "epoch": 0.23635418471668687,
+      "grad_norm": 0.9001489281654358,
+      "learning_rate": 2.406151384602059e-05,
+      "loss": 0.6147,
+      "step": 16368
+    },
+    {
+      "epoch": 0.23680182521804424,
+      "grad_norm": 0.9228829145431519,
+      "learning_rate": 2.3977596075596747e-05,
+      "loss": 0.6089,
+      "step": 16399
+    },
+    {
+      "epoch": 0.23724946571940161,
+      "grad_norm": 0.8185672760009766,
+      "learning_rate": 2.3893689842862223e-05,
+      "loss": 0.6064,
+      "step": 16430
+    },
+    {
+      "epoch": 0.23769710622075896,
+      "grad_norm": 0.848855197429657,
+      "learning_rate": 2.3809796094687475e-05,
+      "loss": 0.6078,
+      "step": 16461
+    },
+    {
+      "epoch": 0.23814474672211633,
+      "grad_norm": 1.5285366773605347,
+      "learning_rate": 2.372591577780202e-05,
+      "loss": 0.6016,
+      "step": 16492
+    },
+    {
+      "epoch": 0.2385923872234737,
+      "grad_norm": 1.0771571397781372,
+      "learning_rate": 2.3642049838783838e-05,
+      "loss": 0.6132,
+      "step": 16523
+    },
+    {
+      "epoch": 0.23904002772483104,
+      "grad_norm": 0.8987991809844971,
+      "learning_rate": 2.3558199224048666e-05,
+      "loss": 0.6098,
+      "step": 16554
+    },
+    {
+      "epoch": 0.23948766822618842,
+      "grad_norm": 0.8981488943099976,
+      "learning_rate": 2.347436487983929e-05,
+      "loss": 0.6168,
+      "step": 16585
+    },
+    {
+      "epoch": 0.2399353087275458,
+      "grad_norm": 0.9029120802879333,
+      "learning_rate": 2.3390547752214888e-05,
+      "loss": 0.6116,
+      "step": 16616
+    },
+    {
+      "epoch": 0.24038294922890313,
+      "grad_norm": 1.0437650680541992,
+      "learning_rate": 2.330674878704035e-05,
+      "loss": 0.599,
+      "step": 16647
+    },
+    {
+      "epoch": 0.2408305897302605,
+      "grad_norm": 0.9616511464118958,
+      "learning_rate": 2.322296892997561e-05,
+      "loss": 0.614,
+      "step": 16678
+    },
+    {
+      "epoch": 0.24127823023161785,
+      "grad_norm": 0.8985153436660767,
+      "learning_rate": 2.313920912646497e-05,
+      "loss": 0.6087,
+      "step": 16709
+    },
+    {
+      "epoch": 0.24172587073297522,
+      "grad_norm": 1.0448508262634277,
+      "learning_rate": 2.305547032172643e-05,
+      "loss": 0.6062,
+      "step": 16740
+    },
+    {
+      "epoch": 0.2421735112343326,
+      "grad_norm": 0.9185760021209717,
+      "learning_rate": 2.2971753460741014e-05,
+      "loss": 0.6099,
+      "step": 16771
+    },
+    {
+      "epoch": 0.24262115173568993,
+      "grad_norm": 1.1951557397842407,
+      "learning_rate": 2.288805948824212e-05,
+      "loss": 0.6091,
+      "step": 16802
+    },
+    {
+      "epoch": 0.2430687922370473,
+      "grad_norm": 0.8947639465332031,
+      "learning_rate": 2.2804389348704858e-05,
+      "loss": 0.6101,
+      "step": 16833
+    },
+    {
+      "epoch": 0.24351643273840468,
+      "grad_norm": 1.0335516929626465,
+      "learning_rate": 2.2720743986335374e-05,
+      "loss": 0.6053,
+      "step": 16864
+    },
+    {
+      "epoch": 0.24396407323976202,
+      "grad_norm": 0.9719113111495972,
+      "learning_rate": 2.2637124345060233e-05,
+      "loss": 0.6093,
+      "step": 16895
+    },
+    {
+      "epoch": 0.2444117137411194,
+      "grad_norm": 0.9017343521118164,
+      "learning_rate": 2.2553531368515695e-05,
+      "loss": 0.6096,
+      "step": 16926
+    },
+    {
+      "epoch": 0.24485935424247676,
+      "grad_norm": 0.9254065155982971,
+      "learning_rate": 2.2469966000037144e-05,
+      "loss": 0.6031,
+      "step": 16957
+    },
+    {
+      "epoch": 0.2453069947438341,
+      "grad_norm": 0.9550548195838928,
+      "learning_rate": 2.2386429182648417e-05,
+      "loss": 0.6136,
+      "step": 16988
+    },
+    {
+      "epoch": 0.24575463524519148,
+      "grad_norm": 0.913746178150177,
+      "learning_rate": 2.230292185905114e-05,
+      "loss": 0.6041,
+      "step": 17019
+    },
+    {
+      "epoch": 0.24620227574654885,
+      "grad_norm": 1.0998092889785767,
+      "learning_rate": 2.2219444971614116e-05,
+      "loss": 0.6031,
+      "step": 17050
+    },
+    {
+      "epoch": 0.2466499162479062,
+      "grad_norm": 0.8995510339736938,
+      "learning_rate": 2.2135999462362655e-05,
+      "loss": 0.6043,
+      "step": 17081
+    },
+    {
+      "epoch": 0.24709755674926356,
+      "grad_norm": 1.0682373046875,
+      "learning_rate": 2.2052586272968003e-05,
+      "loss": 0.6091,
+      "step": 17112
+    },
+    {
+      "epoch": 0.2475451972506209,
+      "grad_norm": 0.9658533334732056,
+      "learning_rate": 2.196920634473666e-05,
+      "loss": 0.6062,
+      "step": 17143
+    },
+    {
+      "epoch": 0.24799283775197828,
+      "grad_norm": 0.9547036290168762,
+      "learning_rate": 2.1885860618599787e-05,
+      "loss": 0.6083,
+      "step": 17174
+    },
+    {
+      "epoch": 0.24844047825333565,
+      "grad_norm": 1.1252254247665405,
+      "learning_rate": 2.1802550035102577e-05,
+      "loss": 0.6047,
+      "step": 17205
+    },
+    {
+      "epoch": 0.248888118754693,
+      "grad_norm": 0.8774239420890808,
+      "learning_rate": 2.171927553439363e-05,
+      "loss": 0.6091,
+      "step": 17236
+    },
+    {
+      "epoch": 0.24933575925605037,
+      "grad_norm": 0.9929160475730896,
+      "learning_rate": 2.1636038056214376e-05,
+      "loss": 0.6037,
+      "step": 17267
+    },
+    {
+      "epoch": 0.24978339975740774,
+      "grad_norm": 1.0022073984146118,
+      "learning_rate": 2.155283853988844e-05,
+      "loss": 0.6106,
+      "step": 17298
+    },
+    {
+      "epoch": 0.2502310402587651,
+      "grad_norm": 0.9709188938140869,
+      "learning_rate": 2.146967792431106e-05,
+      "loss": 0.6043,
+      "step": 17329
+    },
+    {
+      "epoch": 0.25067868076012245,
+      "grad_norm": 0.9158416986465454,
+      "learning_rate": 2.138655714793849e-05,
+      "loss": 0.6002,
+      "step": 17360
+    },
+    {
+      "epoch": 0.2511263212614798,
+      "grad_norm": 1.045093059539795,
+      "learning_rate": 2.1303477148777367e-05,
+      "loss": 0.6027,
+      "step": 17391
+    },
+    {
+      "epoch": 0.2515739617628372,
+      "grad_norm": 0.9029024243354797,
+      "learning_rate": 2.122043886437421e-05,
+      "loss": 0.6095,
+      "step": 17422
+    },
+    {
+      "epoch": 0.2520216022641945,
+      "grad_norm": 1.0147509574890137,
+      "learning_rate": 2.1137443231804765e-05,
+      "loss": 0.6072,
+      "step": 17453
+    },
+    {
+      "epoch": 0.2524692427655519,
+      "grad_norm": 0.9794949293136597,
+      "learning_rate": 2.105449118766347e-05,
+      "loss": 0.6048,
+      "step": 17484
+    },
+    {
+      "epoch": 0.25291688326690925,
+      "grad_norm": 1.186495304107666,
+      "learning_rate": 2.097158366805287e-05,
+      "loss": 0.6079,
+      "step": 17515
+    },
+    {
+      "epoch": 0.2533645237682666,
+      "grad_norm": 0.9781451225280762,
+      "learning_rate": 2.0888721608573047e-05,
+      "loss": 0.6033,
+      "step": 17546
+    },
+    {
+      "epoch": 0.253812164269624,
+      "grad_norm": 0.9464316964149475,
+      "learning_rate": 2.0805905944311087e-05,
+      "loss": 0.6057,
+      "step": 17577
+    },
+    {
+      "epoch": 0.25425980477098137,
+      "grad_norm": 0.9456629753112793,
+      "learning_rate": 2.0723137609830497e-05,
+      "loss": 0.6039,
+      "step": 17608
+    },
+    {
+      "epoch": 0.2547074452723387,
+      "grad_norm": 0.9119940400123596,
+      "learning_rate": 2.0640417539160686e-05,
+      "loss": 0.6059,
+      "step": 17639
+    },
+    {
+      "epoch": 0.25515508577369606,
+      "grad_norm": 1.1009196043014526,
+      "learning_rate": 2.0557746665786427e-05,
+      "loss": 0.6081,
+      "step": 17670
+    },
+    {
+      "epoch": 0.2556027262750534,
+      "grad_norm": 1.010501503944397,
+      "learning_rate": 2.0475125922637256e-05,
+      "loss": 0.6081,
+      "step": 17701
+    },
+    {
+      "epoch": 0.2560503667764108,
+      "grad_norm": 0.9617831707000732,
+      "learning_rate": 2.0392556242077047e-05,
+      "loss": 0.6066,
+      "step": 17732
+    },
+    {
+      "epoch": 0.25649800727776817,
+      "grad_norm": 1.0574779510498047,
+      "learning_rate": 2.031003855589343e-05,
+      "loss": 0.6025,
+      "step": 17763
+    },
+    {
+      "epoch": 0.2569456477791255,
+      "grad_norm": 0.9515939950942993,
+      "learning_rate": 2.022757379528727e-05,
+      "loss": 0.6147,
+      "step": 17794
+    },
+    {
+      "epoch": 0.25739328828048286,
+      "grad_norm": 0.8629471659660339,
+      "learning_rate": 2.0145162890862184e-05,
+      "loss": 0.6018,
+      "step": 17825
+    },
+    {
+      "epoch": 0.25784092878184023,
+      "grad_norm": 1.0973188877105713,
+      "learning_rate": 2.0062806772614022e-05,
+      "loss": 0.5974,
+      "step": 17856
+    },
+    {
+      "epoch": 0.2582885692831976,
+      "grad_norm": 1.0111137628555298,
+      "learning_rate": 1.9980506369920392e-05,
+      "loss": 0.6007,
+      "step": 17887
+    },
+    {
+      "epoch": 0.25873620978455497,
+      "grad_norm": 0.938352644443512,
+      "learning_rate": 1.989826261153015e-05,
+      "loss": 0.6062,
+      "step": 17918
+    },
+    {
+      "epoch": 0.25918385028591234,
+      "grad_norm": 0.8754394054412842,
+      "learning_rate": 1.9816076425552923e-05,
+      "loss": 0.5999,
+      "step": 17949
+    },
+    {
+      "epoch": 0.25963149078726966,
+      "grad_norm": 0.9272274374961853,
+      "learning_rate": 1.9733948739448676e-05,
+      "loss": 0.5958,
+      "step": 17980
+    },
+    {
+      "epoch": 0.26007913128862703,
+      "grad_norm": 0.9161437749862671,
+      "learning_rate": 1.9651880480017155e-05,
+      "loss": 0.6068,
+      "step": 18011
+    },
+    {
+      "epoch": 0.2605267717899844,
+      "grad_norm": 1.0073903799057007,
+      "learning_rate": 1.9569872573387516e-05,
+      "loss": 0.6075,
+      "step": 18042
+    },
+    {
+      "epoch": 0.2609744122913418,
+      "grad_norm": 0.8590899705886841,
+      "learning_rate": 1.9487925945007854e-05,
+      "loss": 0.6017,
+      "step": 18073
+    },
+    {
+      "epoch": 0.26142205279269914,
+      "grad_norm": 0.9320747256278992,
+      "learning_rate": 1.9406041519634726e-05,
+      "loss": 0.6025,
+      "step": 18104
+    },
+    {
+      "epoch": 0.2618696932940565,
+      "grad_norm": 1.000109076499939,
+      "learning_rate": 1.932422022132275e-05,
+      "loss": 0.6025,
+      "step": 18135
+    },
+    {
+      "epoch": 0.26231733379541383,
+      "grad_norm": 0.8880858421325684,
+      "learning_rate": 1.924246297341414e-05,
+      "loss": 0.6029,
+      "step": 18166
+    },
+    {
+      "epoch": 0.2627649742967712,
+      "grad_norm": 0.9839984178543091,
+      "learning_rate": 1.9160770698528338e-05,
+      "loss": 0.6009,
+      "step": 18197
+    },
+    {
+      "epoch": 0.2632126147981286,
+      "grad_norm": 0.9712537527084351,
+      "learning_rate": 1.907914431855156e-05,
+      "loss": 0.6005,
+      "step": 18228
+    },
+    {
+      "epoch": 0.26366025529948595,
+      "grad_norm": 0.9593982100486755,
+      "learning_rate": 1.8997584754626412e-05,
+      "loss": 0.5967,
+      "step": 18259
+    },
+    {
+      "epoch": 0.2641078958008433,
+      "grad_norm": 0.9100329279899597,
+      "learning_rate": 1.8916092927141486e-05,
+      "loss": 0.5926,
+      "step": 18290
+    },
+    {
+      "epoch": 0.26455553630220063,
+      "grad_norm": 0.8858036398887634,
+      "learning_rate": 1.883466975572098e-05,
+      "loss": 0.5972,
+      "step": 18321
+    },
+    {
+      "epoch": 0.265003176803558,
+      "grad_norm": 1.0127744674682617,
+      "learning_rate": 1.8753316159214312e-05,
+      "loss": 0.6029,
+      "step": 18352
+    },
+    {
+      "epoch": 0.2654508173049154,
+      "grad_norm": 0.9447472095489502,
+      "learning_rate": 1.8672033055685766e-05,
+      "loss": 0.6066,
+      "step": 18383
+    },
+    {
+      "epoch": 0.26589845780627275,
+      "grad_norm": 0.818134605884552,
+      "learning_rate": 1.8590821362404116e-05,
+      "loss": 0.5953,
+      "step": 18414
+    },
+    {
+      "epoch": 0.2663460983076301,
+      "grad_norm": 0.9979908466339111,
+      "learning_rate": 1.8509681995832294e-05,
+      "loss": 0.5978,
+      "step": 18445
+    },
+    {
+      "epoch": 0.2667937388089875,
+      "grad_norm": 0.8588074445724487,
+      "learning_rate": 1.8428615871617004e-05,
+      "loss": 0.6004,
+      "step": 18476
+    },
+    {
+      "epoch": 0.2672413793103448,
+      "grad_norm": 0.9643010497093201,
+      "learning_rate": 1.8347623904578448e-05,
+      "loss": 0.6071,
+      "step": 18507
+    },
+    {
+      "epoch": 0.2676890198117022,
+      "grad_norm": 0.8365680575370789,
+      "learning_rate": 1.8266707008699975e-05,
+      "loss": 0.5998,
+      "step": 18538
+    },
+    {
+      "epoch": 0.26813666031305955,
+      "grad_norm": 0.8986954689025879,
+      "learning_rate": 1.818586609711774e-05,
+      "loss": 0.5982,
+      "step": 18569
+    },
+    {
+      "epoch": 0.2685843008144169,
+      "grad_norm": 1.0341336727142334,
+      "learning_rate": 1.8105102082110462e-05,
+      "loss": 0.6008,
+      "step": 18600
+    },
+    {
+      "epoch": 0.2690319413157743,
+      "grad_norm": 1.0030567646026611,
+      "learning_rate": 1.8024415875089058e-05,
+      "loss": 0.6011,
+      "step": 18631
+    },
+    {
+      "epoch": 0.2694795818171316,
+      "grad_norm": 0.9385823607444763,
+      "learning_rate": 1.7943808386586407e-05,
+      "loss": 0.6077,
+      "step": 18662
+    },
+    {
+      "epoch": 0.269927222318489,
+      "grad_norm": 0.8827871680259705,
+      "learning_rate": 1.7863280526247073e-05,
+      "loss": 0.6073,
+      "step": 18693
+    },
+    {
+      "epoch": 0.27037486281984635,
+      "grad_norm": 0.9739916324615479,
+      "learning_rate": 1.7782833202817003e-05,
+      "loss": 0.595,
+      "step": 18724
+    },
+    {
+      "epoch": 0.2708225033212037,
+      "grad_norm": 0.9108980298042297,
+      "learning_rate": 1.7702467324133327e-05,
+      "loss": 0.587,
+      "step": 18755
+    },
+    {
+      "epoch": 0.2712701438225611,
+      "grad_norm": 1.0579863786697388,
+      "learning_rate": 1.7622183797114042e-05,
+      "loss": 0.6043,
+      "step": 18786
+    },
+    {
+      "epoch": 0.27171778432391847,
+      "grad_norm": 0.9881874322891235,
+      "learning_rate": 1.7541983527747838e-05,
+      "loss": 0.5905,
+      "step": 18817
+    },
+    {
+      "epoch": 0.2721654248252758,
+      "grad_norm": 0.9560896158218384,
+      "learning_rate": 1.746186742108387e-05,
+      "loss": 0.6033,
+      "step": 18848
+    },
+    {
+      "epoch": 0.27261306532663315,
+      "grad_norm": 0.9506632685661316,
+      "learning_rate": 1.73818363812215e-05,
+      "loss": 0.5935,
+      "step": 18879
+    },
+    {
+      "epoch": 0.2730607058279905,
+      "grad_norm": 0.9935999512672424,
+      "learning_rate": 1.7301891311300153e-05,
+      "loss": 0.5997,
+      "step": 18910
+    },
+    {
+      "epoch": 0.2735083463293479,
+      "grad_norm": 0.9102685451507568,
+      "learning_rate": 1.7222033113489055e-05,
+      "loss": 0.5982,
+      "step": 18941
+    },
+    {
+      "epoch": 0.27395598683070527,
+      "grad_norm": 1.0436829328536987,
+      "learning_rate": 1.7142262688977127e-05,
+      "loss": 0.603,
+      "step": 18972
+    },
+    {
+      "epoch": 0.27440362733206264,
+      "grad_norm": 1.0441209077835083,
+      "learning_rate": 1.7062580937962764e-05,
+      "loss": 0.5957,
+      "step": 19003
+    },
+    {
+      "epoch": 0.27485126783341995,
+      "grad_norm": 0.9903119206428528,
+      "learning_rate": 1.698298875964369e-05,
+      "loss": 0.5972,
+      "step": 19034
+    },
+    {
+      "epoch": 0.2752989083347773,
+      "grad_norm": 0.8914598226547241,
+      "learning_rate": 1.690348705220684e-05,
+      "loss": 0.6027,
+      "step": 19065
+    },
+    {
+      "epoch": 0.2757465488361347,
+      "grad_norm": 0.9678306579589844,
+      "learning_rate": 1.6824076712818156e-05,
+      "loss": 0.5999,
+      "step": 19096
+    },
+    {
+      "epoch": 0.27619418933749207,
+      "grad_norm": 0.9159491658210754,
+      "learning_rate": 1.6744758637612533e-05,
+      "loss": 0.6029,
+      "step": 19127
+    },
+    {
+      "epoch": 0.27664182983884944,
+      "grad_norm": 1.0948030948638916,
+      "learning_rate": 1.6665533721683664e-05,
+      "loss": 0.604,
+      "step": 19158
+    },
+    {
+      "epoch": 0.27708947034020676,
+      "grad_norm": 0.866001546382904,
+      "learning_rate": 1.6586402859073974e-05,
+      "loss": 0.5863,
+      "step": 19189
+    },
+    {
+      "epoch": 0.2775371108415641,
+      "grad_norm": 1.0278693437576294,
+      "learning_rate": 1.6507366942764463e-05,
+      "loss": 0.5937,
+      "step": 19220
+    },
+    {
+      "epoch": 0.2779847513429215,
+      "grad_norm": 0.9074748158454895,
+      "learning_rate": 1.6428426864664732e-05,
+      "loss": 0.602,
+      "step": 19251
+    },
+    {
+      "epoch": 0.27843239184427887,
+      "grad_norm": 0.9951406717300415,
+      "learning_rate": 1.6349583515602816e-05,
+      "loss": 0.5982,
+      "step": 19282
+    },
+    {
+      "epoch": 0.27888003234563624,
+      "grad_norm": 1.0565474033355713,
+      "learning_rate": 1.6270837785315208e-05,
+      "loss": 0.6008,
+      "step": 19313
+    },
+    {
+      "epoch": 0.2793276728469936,
+      "grad_norm": 0.9266191124916077,
+      "learning_rate": 1.619219056243676e-05,
+      "loss": 0.5994,
+      "step": 19344
+    },
+    {
+      "epoch": 0.27977531334835093,
+      "grad_norm": 0.8990464806556702,
+      "learning_rate": 1.6113642734490698e-05,
+      "loss": 0.5984,
+      "step": 19375
+    },
+    {
+      "epoch": 0.2802229538497083,
+      "grad_norm": 0.9231170415878296,
+      "learning_rate": 1.6035195187878577e-05,
+      "loss": 0.5952,
+      "step": 19406
+    },
+    {
+      "epoch": 0.28067059435106567,
+      "grad_norm": 1.035946011543274,
+      "learning_rate": 1.5956848807870305e-05,
+      "loss": 0.5985,
+      "step": 19437
+    },
+    {
+      "epoch": 0.28111823485242304,
+      "grad_norm": 0.8787546157836914,
+      "learning_rate": 1.587860447859413e-05,
+      "loss": 0.5999,
+      "step": 19468
+    },
+    {
+      "epoch": 0.2815658753537804,
+      "grad_norm": 0.8387063145637512,
+      "learning_rate": 1.5800463083026686e-05,
+      "loss": 0.5973,
+      "step": 19499
+    },
+    {
+      "epoch": 0.28201351585513773,
+      "grad_norm": 1.025985598564148,
+      "learning_rate": 1.572242550298298e-05,
+      "loss": 0.597,
+      "step": 19530
+    },
+    {
+      "epoch": 0.2824611563564951,
+      "grad_norm": 0.9072343707084656,
+      "learning_rate": 1.56444926191065e-05,
+      "loss": 0.5868,
+      "step": 19561
+    },
+    {
+      "epoch": 0.2829087968578525,
+      "grad_norm": 0.9914515614509583,
+      "learning_rate": 1.5566665310859257e-05,
+      "loss": 0.5926,
+      "step": 19592
+    },
+    {
+      "epoch": 0.28335643735920985,
+      "grad_norm": 0.9568142294883728,
+      "learning_rate": 1.5488944456511846e-05,
+      "loss": 0.6023,
+      "step": 19623
+    },
+    {
+      "epoch": 0.2838040778605672,
+      "grad_norm": 0.8508808016777039,
+      "learning_rate": 1.5411330933133546e-05,
+      "loss": 0.5991,
+      "step": 19654
+    },
+    {
+      "epoch": 0.2842517183619246,
+      "grad_norm": 0.9583558440208435,
+      "learning_rate": 1.533382561658241e-05,
+      "loss": 0.6031,
+      "step": 19685
+    },
+    {
+      "epoch": 0.2846993588632819,
+      "grad_norm": 0.9079626798629761,
+      "learning_rate": 1.525642938149541e-05,
+      "loss": 0.6021,
+      "step": 19716
+    },
+    {
+      "epoch": 0.2851469993646393,
+      "grad_norm": 0.8839224576950073,
+      "learning_rate": 1.5179143101278536e-05,
+      "loss": 0.5974,
+      "step": 19747
+    },
+    {
+      "epoch": 0.28559463986599665,
+      "grad_norm": 0.9244747161865234,
+      "learning_rate": 1.5101967648096955e-05,
+      "loss": 0.5951,
+      "step": 19778
+    },
+    {
+      "epoch": 0.286042280367354,
+      "grad_norm": 0.937430202960968,
+      "learning_rate": 1.5024903892865172e-05,
+      "loss": 0.5981,
+      "step": 19809
+    },
+    {
+      "epoch": 0.2864899208687114,
+      "grad_norm": 1.0390359163284302,
+      "learning_rate": 1.4947952705237184e-05,
+      "loss": 0.6017,
+      "step": 19840
+    },
+    {
+      "epoch": 0.28693756137006876,
+      "grad_norm": 0.9726883172988892,
+      "learning_rate": 1.4871114953596682e-05,
+      "loss": 0.5956,
+      "step": 19871
+    },
+    {
+      "epoch": 0.2873852018714261,
+      "grad_norm": 0.8611225485801697,
+      "learning_rate": 1.4794391505047256e-05,
+      "loss": 0.5875,
+      "step": 19902
+    },
+    {
+      "epoch": 0.28783284237278345,
+      "grad_norm": 0.9599292278289795,
+      "learning_rate": 1.4717783225402596e-05,
+      "loss": 0.5948,
+      "step": 19933
+    },
+    {
+      "epoch": 0.2882804828741408,
+      "grad_norm": 0.9473167061805725,
+      "learning_rate": 1.4641290979176735e-05,
+      "loss": 0.5967,
+      "step": 19964
+    },
+    {
+      "epoch": 0.2887281233754982,
+      "grad_norm": 0.9631912708282471,
+      "learning_rate": 1.4564915629574246e-05,
+      "loss": 0.5962,
+      "step": 19995
+    },
+    {
+      "epoch": 0.28917576387685556,
+      "grad_norm": 0.9674975872039795,
+      "learning_rate": 1.4488658038480601e-05,
+      "loss": 0.59,
+      "step": 20026
+    },
+    {
+      "epoch": 0.2896234043782129,
+      "grad_norm": 1.1209561824798584,
+      "learning_rate": 1.4412519066452323e-05,
+      "loss": 0.6032,
+      "step": 20057
+    },
+    {
+      "epoch": 0.29007104487957025,
+      "grad_norm": 0.9360538125038147,
+      "learning_rate": 1.4336499572707373e-05,
+      "loss": 0.5975,
+      "step": 20088
+    },
+    {
+      "epoch": 0.2905186853809276,
+      "grad_norm": 0.9791879653930664,
+      "learning_rate": 1.4260600415115433e-05,
+      "loss": 0.6051,
+      "step": 20119
+    },
+    {
+      "epoch": 0.290966325882285,
+      "grad_norm": 1.0199767351150513,
+      "learning_rate": 1.4184822450188137e-05,
+      "loss": 0.5912,
+      "step": 20150
+    },
+    {
+      "epoch": 0.29141396638364236,
+      "grad_norm": 0.8803568482398987,
+      "learning_rate": 1.410916653306954e-05,
+      "loss": 0.6007,
+      "step": 20181
+    },
+    {
+      "epoch": 0.29186160688499974,
+      "grad_norm": 0.9544051289558411,
+      "learning_rate": 1.403363351752639e-05,
+      "loss": 0.5901,
+      "step": 20212
+    },
+    {
+      "epoch": 0.29230924738635705,
+      "grad_norm": 1.0661756992340088,
+      "learning_rate": 1.3958224255938485e-05,
+      "loss": 0.595,
+      "step": 20243
+    },
+    {
+      "epoch": 0.2927568878877144,
+      "grad_norm": 0.9343761801719666,
+      "learning_rate": 1.388293959928911e-05,
+      "loss": 0.6016,
+      "step": 20274
+    },
+    {
+      "epoch": 0.2932045283890718,
+      "grad_norm": 1.0200270414352417,
+      "learning_rate": 1.3807780397155379e-05,
+      "loss": 0.6,
+      "step": 20305
+    },
+    {
+      "epoch": 0.29365216889042917,
+      "grad_norm": 0.8452933430671692,
+      "learning_rate": 1.3732747497698655e-05,
+      "loss": 0.5978,
+      "step": 20336
+    },
+    {
+      "epoch": 0.29409980939178654,
+      "grad_norm": 1.000546932220459,
+      "learning_rate": 1.3657841747655038e-05,
+      "loss": 0.5933,
+      "step": 20367
+    },
+    {
+      "epoch": 0.29454744989314385,
+      "grad_norm": 0.9047265648841858,
+      "learning_rate": 1.3583063992325706e-05,
+      "loss": 0.5954,
+      "step": 20398
+    },
+    {
+      "epoch": 0.2949950903945012,
+      "grad_norm": 0.877160906791687,
+      "learning_rate": 1.3508415075567496e-05,
+      "loss": 0.5921,
+      "step": 20429
+    },
+    {
+      "epoch": 0.2954427308958586,
+      "grad_norm": 1.0855872631072998,
+      "learning_rate": 1.343389583978327e-05,
+      "loss": 0.5992,
+      "step": 20460
+    },
+    {
+      "epoch": 0.29589037139721597,
+      "grad_norm": 1.006057620048523,
+      "learning_rate": 1.3359507125912468e-05,
+      "loss": 0.5916,
+      "step": 20491
+    },
+    {
+      "epoch": 0.29633801189857334,
+      "grad_norm": 1.003037929534912,
+      "learning_rate": 1.3285249773421627e-05,
+      "loss": 0.5918,
+      "step": 20522
+    },
+    {
+      "epoch": 0.2967856523999307,
+      "grad_norm": 0.9983749389648438,
+      "learning_rate": 1.3211124620294884e-05,
+      "loss": 0.5922,
+      "step": 20553
+    },
+    {
+      "epoch": 0.297233292901288,
+      "grad_norm": 1.0387030839920044,
+      "learning_rate": 1.313713250302451e-05,
+      "loss": 0.5991,
+      "step": 20584
+    },
+    {
+      "epoch": 0.2976809334026454,
+      "grad_norm": 0.8586576581001282,
+      "learning_rate": 1.3063274256601479e-05,
+      "loss": 0.6001,
+      "step": 20615
+    },
+    {
+      "epoch": 0.29812857390400277,
+      "grad_norm": 0.9596696496009827,
+      "learning_rate": 1.2989550714506086e-05,
+      "loss": 0.5988,
+      "step": 20646
+    },
+    {
+      "epoch": 0.29857621440536014,
+      "grad_norm": 0.9584054350852966,
+      "learning_rate": 1.291596270869846e-05,
+      "loss": 0.5946,
+      "step": 20677
+    },
+    {
+      "epoch": 0.2990238549067175,
+      "grad_norm": 1.02814519405365,
+      "learning_rate": 1.284251106960927e-05,
+      "loss": 0.5941,
+      "step": 20708
+    },
+    {
+      "epoch": 0.2994714954080749,
+      "grad_norm": 1.1163685321807861,
+      "learning_rate": 1.2769196626130263e-05,
+      "loss": 0.6002,
+      "step": 20739
+    },
+    {
+      "epoch": 0.2999191359094322,
+      "grad_norm": 0.9234864711761475,
+      "learning_rate": 1.2696020205604969e-05,
+      "loss": 0.5919,
+      "step": 20770
+    },
+    {
+      "epoch": 0.30036677641078957,
+      "grad_norm": 0.9402379393577576,
+      "learning_rate": 1.2622982633819359e-05,
+      "loss": 0.5931,
+      "step": 20801
+    },
+    {
+      "epoch": 0.30081441691214694,
+      "grad_norm": 0.9681121110916138,
+      "learning_rate": 1.2550084734992484e-05,
+      "loss": 0.5904,
+      "step": 20832
+    },
+    {
+      "epoch": 0.3012620574135043,
+      "grad_norm": 0.9508892893791199,
+      "learning_rate": 1.247732733176724e-05,
+      "loss": 0.5997,
+      "step": 20863
+    },
+    {
+      "epoch": 0.3017096979148617,
+      "grad_norm": 0.8664924502372742,
+      "learning_rate": 1.2404711245201044e-05,
+      "loss": 0.594,
+      "step": 20894
+    },
+    {
+      "epoch": 0.302157338416219,
+      "grad_norm": 0.8919743299484253,
+      "learning_rate": 1.2332237294756535e-05,
+      "loss": 0.5873,
+      "step": 20925
+    },
+    {
+      "epoch": 0.3026049789175764,
+      "grad_norm": 0.9090976119041443,
+      "learning_rate": 1.225990629829241e-05,
+      "loss": 0.5966,
+      "step": 20956
+    },
+    {
+      "epoch": 0.30305261941893374,
+      "grad_norm": 0.8878434896469116,
+      "learning_rate": 1.2187719072054136e-05,
+      "loss": 0.5939,
+      "step": 20987
+    },
+    {
+      "epoch": 0.3035002599202911,
+      "grad_norm": 0.8897850513458252,
+      "learning_rate": 1.2115676430664735e-05,
+      "loss": 0.5978,
+      "step": 21018
+    },
+    {
+      "epoch": 0.3039479004216485,
+      "grad_norm": 0.8866651654243469,
+      "learning_rate": 1.2043779187115647e-05,
+      "loss": 0.5873,
+      "step": 21049
+    },
+    {
+      "epoch": 0.30439554092300586,
+      "grad_norm": 0.8519348502159119,
+      "learning_rate": 1.1972028152757476e-05,
+      "loss": 0.5991,
+      "step": 21080
+    },
+    {
+      "epoch": 0.3048431814243632,
+      "grad_norm": 1.146201252937317,
+      "learning_rate": 1.1900424137290889e-05,
+      "loss": 0.5928,
+      "step": 21111
+    },
+    {
+      "epoch": 0.30529082192572055,
+      "grad_norm": 1.0777043104171753,
+      "learning_rate": 1.1828967948757482e-05,
+      "loss": 0.5966,
+      "step": 21142
+    },
+    {
+      "epoch": 0.3057384624270779,
+      "grad_norm": 1.0404378175735474,
+      "learning_rate": 1.175766039353062e-05,
+      "loss": 0.607,
+      "step": 21173
+    },
+    {
+      "epoch": 0.3061861029284353,
+      "grad_norm": 0.8684154152870178,
+      "learning_rate": 1.1686502276306382e-05,
+      "loss": 0.5992,
+      "step": 21204
+    },
+    {
+      "epoch": 0.30663374342979266,
+      "grad_norm": 0.9449039101600647,
+      "learning_rate": 1.1615494400094445e-05,
+      "loss": 0.5937,
+      "step": 21235
+    },
+    {
+      "epoch": 0.30708138393115,
+      "grad_norm": 0.9459973573684692,
+      "learning_rate": 1.1544637566209029e-05,
+      "loss": 0.5953,
+      "step": 21266
+    },
+    {
+      "epoch": 0.30752902443250735,
+      "grad_norm": 0.8467513918876648,
+      "learning_rate": 1.1473932574259886e-05,
+      "loss": 0.5937,
+      "step": 21297
+    },
+    {
+      "epoch": 0.3079766649338647,
+      "grad_norm": 0.953157901763916,
+      "learning_rate": 1.1403380222143247e-05,
+      "loss": 0.592,
+      "step": 21328
+    },
+    {
+      "epoch": 0.3084243054352221,
+      "grad_norm": 0.9762019515037537,
+      "learning_rate": 1.1332981306032808e-05,
+      "loss": 0.6009,
+      "step": 21359
+    },
+    {
+      "epoch": 0.30887194593657946,
+      "grad_norm": 0.7931903600692749,
+      "learning_rate": 1.1262736620370762e-05,
+      "loss": 0.5951,
+      "step": 21390
+    },
+    {
+      "epoch": 0.30931958643793683,
+      "grad_norm": 1.042128324508667,
+      "learning_rate": 1.1192646957858854e-05,
+      "loss": 0.5951,
+      "step": 21421
+    },
+    {
+      "epoch": 0.30976722693929415,
+      "grad_norm": 0.9942502379417419,
+      "learning_rate": 1.1122713109449381e-05,
+      "loss": 0.5945,
+      "step": 21452
+    },
+    {
+      "epoch": 0.3102148674406515,
+      "grad_norm": 0.9771155118942261,
+      "learning_rate": 1.105293586433634e-05,
+      "loss": 0.5929,
+      "step": 21483
+    },
+    {
+      "epoch": 0.3106625079420089,
+      "grad_norm": 0.9390444159507751,
+      "learning_rate": 1.0983316009946446e-05,
+      "loss": 0.5944,
+      "step": 21514
+    },
+    {
+      "epoch": 0.31111014844336626,
+      "grad_norm": 0.9289217591285706,
+      "learning_rate": 1.0913854331930282e-05,
+      "loss": 0.5902,
+      "step": 21545
+    },
+    {
+      "epoch": 0.31155778894472363,
+      "grad_norm": 0.8977670669555664,
+      "learning_rate": 1.0844551614153456e-05,
+      "loss": 0.5951,
+      "step": 21576
+    },
+    {
+      "epoch": 0.312005429446081,
+      "grad_norm": 0.9898940920829773,
+      "learning_rate": 1.0775408638687725e-05,
+      "loss": 0.5997,
+      "step": 21607
+    },
+    {
+      "epoch": 0.3124530699474383,
+      "grad_norm": 0.9756447076797485,
+      "learning_rate": 1.0706426185802165e-05,
+      "loss": 0.5969,
+      "step": 21638
+    },
+    {
+      "epoch": 0.3129007104487957,
+      "grad_norm": 1.0475540161132812,
+      "learning_rate": 1.0637605033954371e-05,
+      "loss": 0.5908,
+      "step": 21669
+    },
+    {
+      "epoch": 0.31334835095015307,
+      "grad_norm": 0.9765790700912476,
+      "learning_rate": 1.05689459597817e-05,
+      "loss": 0.5903,
+      "step": 21700
+    },
+    {
+      "epoch": 0.31379599145151044,
+      "grad_norm": 0.8677023649215698,
+      "learning_rate": 1.050044973809246e-05,
+      "loss": 0.5907,
+      "step": 21731
+    },
+    {
+      "epoch": 0.3142436319528678,
+      "grad_norm": 0.937731921672821,
+      "learning_rate": 1.043211714185722e-05,
+      "loss": 0.603,
+      "step": 21762
+    },
+    {
+      "epoch": 0.3146912724542251,
+      "grad_norm": 0.8233932256698608,
+      "learning_rate": 1.036394894220003e-05,
+      "loss": 0.5955,
+      "step": 21793
+    },
+    {
+      "epoch": 0.3151389129555825,
+      "grad_norm": 1.1260769367218018,
+      "learning_rate": 1.0295945908389751e-05,
+      "loss": 0.605,
+      "step": 21824
+    },
+    {
+      "epoch": 0.31558655345693987,
+      "grad_norm": 0.9366801977157593,
+      "learning_rate": 1.0228108807831393e-05,
+      "loss": 0.5963,
+      "step": 21855
+    },
+    {
+      "epoch": 0.31603419395829724,
+      "grad_norm": 0.871155321598053,
+      "learning_rate": 1.01604384060574e-05,
+      "loss": 0.5876,
+      "step": 21886
+    },
+    {
+      "epoch": 0.3164818344596546,
+      "grad_norm": 0.9532550573348999,
+      "learning_rate": 1.009293546671907e-05,
+      "loss": 0.5863,
+      "step": 21917
+    },
+    {
+      "epoch": 0.316929474961012,
+      "grad_norm": 1.045569658279419,
+      "learning_rate": 1.002560075157791e-05,
+      "loss": 0.5899,
+      "step": 21948
+    },
+    {
+      "epoch": 0.3173771154623693,
+      "grad_norm": 0.9291980862617493,
+      "learning_rate": 9.958435020496995e-06,
+      "loss": 0.5904,
+      "step": 21979
+    },
+    {
+      "epoch": 0.31782475596372667,
+      "grad_norm": 0.8881365060806274,
+      "learning_rate": 9.89143903143249e-06,
+      "loss": 0.5997,
+      "step": 22010
+    },
+    {
+      "epoch": 0.31827239646508404,
+      "grad_norm": 0.9601870179176331,
+      "learning_rate": 9.824613540425038e-06,
+      "loss": 0.5965,
+      "step": 22041
+    },
+    {
+      "epoch": 0.3187200369664414,
+      "grad_norm": 0.8519198298454285,
+      "learning_rate": 9.757959301591197e-06,
+      "loss": 0.5887,
+      "step": 22072
+    },
+    {
+      "epoch": 0.3191676774677988,
+      "grad_norm": 0.9262305498123169,
+      "learning_rate": 9.691477067115017e-06,
+      "loss": 0.5846,
+      "step": 22103
+    },
+    {
+      "epoch": 0.3196153179691561,
+      "grad_norm": 1.0259448289871216,
+      "learning_rate": 9.625167587239467e-06,
+      "loss": 0.5865,
+      "step": 22134
+    },
+    {
+      "epoch": 0.32006295847051347,
+      "grad_norm": 0.9057780504226685,
+      "learning_rate": 9.559031610258007e-06,
+      "loss": 0.592,
+      "step": 22165
+    },
+    {
+      "epoch": 0.32051059897187084,
+      "grad_norm": 0.905604362487793,
+      "learning_rate": 9.493069882506164e-06,
+      "loss": 0.5958,
+      "step": 22196
+    },
+    {
+      "epoch": 0.3209582394732282,
+      "grad_norm": 0.8837811946868896,
+      "learning_rate": 9.427283148353056e-06,
+      "loss": 0.5955,
+      "step": 22227
+    },
+    {
+      "epoch": 0.3214058799745856,
+      "grad_norm": 0.9125133752822876,
+      "learning_rate": 9.361672150193052e-06,
+      "loss": 0.5915,
+      "step": 22258
+    },
+    {
+      "epoch": 0.32185352047594296,
+      "grad_norm": 0.8553541898727417,
+      "learning_rate": 9.29623762843734e-06,
+      "loss": 0.586,
+      "step": 22289
+    },
+    {
+      "epoch": 0.32230116097730027,
+      "grad_norm": 0.8609781861305237,
+      "learning_rate": 9.230980321505594e-06,
+      "loss": 0.5867,
+      "step": 22320
+    },
+    {
+      "epoch": 0.32274880147865764,
+      "grad_norm": 0.8896780014038086,
+      "learning_rate": 9.165900965817668e-06,
+      "loss": 0.5862,
+      "step": 22351
+    },
+    {
+      "epoch": 0.323196441980015,
+      "grad_norm": 1.0318437814712524,
+      "learning_rate": 9.101000295785245e-06,
+      "loss": 0.5906,
+      "step": 22382
+    },
+    {
+      "epoch": 0.3236440824813724,
+      "grad_norm": 1.0346667766571045,
+      "learning_rate": 9.036279043803565e-06,
+      "loss": 0.594,
+      "step": 22413
+    },
+    {
+      "epoch": 0.32409172298272976,
+      "grad_norm": 0.899023175239563,
+      "learning_rate": 8.971737940243147e-06,
+      "loss": 0.5983,
+      "step": 22444
+    },
+    {
+      "epoch": 0.32453936348408713,
+      "grad_norm": 0.8427733182907104,
+      "learning_rate": 8.907377713441592e-06,
+      "loss": 0.5928,
+      "step": 22475
+    },
+    {
+      "epoch": 0.32498700398544444,
+      "grad_norm": 0.8469851613044739,
+      "learning_rate": 8.843199089695293e-06,
+      "loss": 0.5867,
+      "step": 22506
+    },
+    {
+      "epoch": 0.3254346444868018,
+      "grad_norm": 0.8703016638755798,
+      "learning_rate": 8.779202793251311e-06,
+      "loss": 0.5894,
+      "step": 22537
+    },
+    {
+      "epoch": 0.3258822849881592,
+      "grad_norm": 0.9438649415969849,
+      "learning_rate": 8.715389546299149e-06,
+      "loss": 0.5949,
+      "step": 22568
+    },
+    {
+      "epoch": 0.32632992548951656,
+      "grad_norm": 0.8361387848854065,
+      "learning_rate": 8.651760068962617e-06,
+      "loss": 0.5956,
+      "step": 22599
+    },
+    {
+      "epoch": 0.32677756599087393,
+      "grad_norm": 0.8810434341430664,
+      "learning_rate": 8.588315079291733e-06,
+      "loss": 0.5904,
+      "step": 22630
+    },
+    {
+      "epoch": 0.32722520649223125,
+      "grad_norm": 0.9140039682388306,
+      "learning_rate": 8.52505529325457e-06,
+      "loss": 0.5871,
+      "step": 22661
+    },
+    {
+      "epoch": 0.3276728469935886,
+      "grad_norm": 0.8848084211349487,
+      "learning_rate": 8.461981424729216e-06,
+      "loss": 0.5973,
+      "step": 22692
+    },
+    {
+      "epoch": 0.328120487494946,
+      "grad_norm": 0.8551177382469177,
+      "learning_rate": 8.399094185495725e-06,
+      "loss": 0.5925,
+      "step": 22723
+    },
+    {
+      "epoch": 0.32856812799630336,
+      "grad_norm": 0.9873132705688477,
+      "learning_rate": 8.336394285228017e-06,
+      "loss": 0.592,
+      "step": 22754
+    },
+    {
+      "epoch": 0.32901576849766073,
+      "grad_norm": 0.9582761526107788,
+      "learning_rate": 8.273882431485952e-06,
+      "loss": 0.5957,
+      "step": 22785
+    },
+    {
+      "epoch": 0.3294634089990181,
+      "grad_norm": 0.9337429404258728,
+      "learning_rate": 8.211559329707316e-06,
+      "loss": 0.5893,
+      "step": 22816
+    },
+    {
+      "epoch": 0.3299110495003754,
+      "grad_norm": 0.8926681280136108,
+      "learning_rate": 8.149425683199823e-06,
+      "loss": 0.593,
+      "step": 22847
+    },
+    {
+      "epoch": 0.3303586900017328,
+      "grad_norm": 0.8568328022956848,
+      "learning_rate": 8.08748219313325e-06,
+      "loss": 0.5895,
+      "step": 22878
+    },
+    {
+      "epoch": 0.33080633050309016,
+      "grad_norm": 0.971608579158783,
+      "learning_rate": 8.025729558531453e-06,
+      "loss": 0.5946,
+      "step": 22909
+    },
+    {
+      "epoch": 0.33125397100444753,
+      "grad_norm": 0.9121518135070801,
+      "learning_rate": 7.964168476264508e-06,
+      "loss": 0.592,
+      "step": 22940
+    },
+    {
+      "epoch": 0.3317016115058049,
+      "grad_norm": 0.9045124650001526,
+      "learning_rate": 7.902799641040884e-06,
+      "loss": 0.5915,
+      "step": 22971
+    },
+    {
+      "epoch": 0.3321492520071622,
+      "grad_norm": 0.8435531258583069,
+      "learning_rate": 7.841623745399523e-06,
+      "loss": 0.5902,
+      "step": 23002
+    },
+    {
+      "epoch": 0.3325968925085196,
+      "grad_norm": 0.9218406677246094,
+      "learning_rate": 7.780641479702114e-06,
+      "loss": 0.5949,
+      "step": 23033
+    },
+    {
+      "epoch": 0.33304453300987696,
+      "grad_norm": 0.8910061120986938,
+      "learning_rate": 7.719853532125227e-06,
+      "loss": 0.589,
+      "step": 23064
+    },
+    {
+      "epoch": 0.33349217351123434,
+      "grad_norm": 0.9808411598205566,
+      "learning_rate": 7.65926058865258e-06,
+      "loss": 0.592,
+      "step": 23095
+    },
+    {
+      "epoch": 0.3339398140125917,
+      "grad_norm": 0.9253140091896057,
+      "learning_rate": 7.598863333067313e-06,
+      "loss": 0.5867,
+      "step": 23126
+    },
+    {
+      "epoch": 0.3343874545139491,
+      "grad_norm": 0.8689921498298645,
+      "learning_rate": 7.538662446944253e-06,
+      "loss": 0.5817,
+      "step": 23157
+    },
+    {
+      "epoch": 0.3348350950153064,
+      "grad_norm": 0.9247636198997498,
+      "learning_rate": 7.478658609642211e-06,
+      "loss": 0.5865,
+      "step": 23188
+    },
+    {
+      "epoch": 0.33528273551666377,
+      "grad_norm": 0.99556565284729,
+      "learning_rate": 7.418852498296327e-06,
+      "loss": 0.5936,
+      "step": 23219
+    },
+    {
+      "epoch": 0.33573037601802114,
+      "grad_norm": 0.8474295139312744,
+      "learning_rate": 7.359244787810457e-06,
+      "loss": 0.5846,
+      "step": 23250
+    },
+    {
+      "epoch": 0.3361780165193785,
+      "grad_norm": 0.9600043892860413,
+      "learning_rate": 7.299836150849493e-06,
+      "loss": 0.5858,
+      "step": 23281
+    },
+    {
+      "epoch": 0.3366256570207359,
+      "grad_norm": 0.9534376263618469,
+      "learning_rate": 7.240627257831847e-06,
+      "loss": 0.588,
+      "step": 23312
+    },
+    {
+      "epoch": 0.33707329752209325,
+      "grad_norm": 0.9994376301765442,
+      "learning_rate": 7.1816187769218195e-06,
+      "loss": 0.5924,
+      "step": 23343
+    },
+    {
+      "epoch": 0.33752093802345057,
+      "grad_norm": 0.9056828618049622,
+      "learning_rate": 7.1228113740220895e-06,
+      "loss": 0.5842,
+      "step": 23374
+    },
+    {
+      "epoch": 0.33796857852480794,
+      "grad_norm": 0.9004384875297546,
+      "learning_rate": 7.064205712766226e-06,
+      "loss": 0.5829,
+      "step": 23405
+    },
+    {
+      "epoch": 0.3384162190261653,
+      "grad_norm": 0.9889013171195984,
+      "learning_rate": 7.005802454511129e-06,
+      "loss": 0.5825,
+      "step": 23436
+    },
+    {
+      "epoch": 0.3388638595275227,
+      "grad_norm": 0.8478637933731079,
+      "learning_rate": 6.947602258329639e-06,
+      "loss": 0.5857,
+      "step": 23467
+    },
+    {
+      "epoch": 0.33931150002888005,
+      "grad_norm": 0.825160801410675,
+      "learning_rate": 6.889605781003078e-06,
+      "loss": 0.5922,
+      "step": 23498
+    },
+    {
+      "epoch": 0.33975914053023737,
+      "grad_norm": 0.968928337097168,
+      "learning_rate": 6.831813677013776e-06,
+      "loss": 0.5912,
+      "step": 23529
+    },
+    {
+      "epoch": 0.34020678103159474,
+      "grad_norm": 0.9048051238059998,
+      "learning_rate": 6.774226598537792e-06,
+      "loss": 0.5847,
+      "step": 23560
+    },
+    {
+      "epoch": 0.3406544215329521,
+      "grad_norm": 0.9165478348731995,
+      "learning_rate": 6.716845195437482e-06,
+      "loss": 0.5914,
+      "step": 23591
+    },
+    {
+      "epoch": 0.3411020620343095,
+      "grad_norm": 0.8867752552032471,
+      "learning_rate": 6.659670115254168e-06,
+      "loss": 0.5854,
+      "step": 23622
+    },
+    {
+      "epoch": 0.34154970253566685,
+      "grad_norm": 0.8337939977645874,
+      "learning_rate": 6.602702003200872e-06,
+      "loss": 0.5887,
+      "step": 23653
+    },
+    {
+      "epoch": 0.3419973430370242,
+      "grad_norm": 1.0237765312194824,
+      "learning_rate": 6.545941502154992e-06,
+      "loss": 0.5909,
+      "step": 23684
+    },
+    {
+      "epoch": 0.34244498353838154,
+      "grad_norm": 0.9445424675941467,
+      "learning_rate": 6.489389252651057e-06,
+      "loss": 0.593,
+      "step": 23715
+    },
+    {
+      "epoch": 0.3428926240397389,
+      "grad_norm": 0.874966025352478,
+      "learning_rate": 6.4330458928735325e-06,
+      "loss": 0.5858,
+      "step": 23746
+    },
+    {
+      "epoch": 0.3433402645410963,
+      "grad_norm": 1.0574617385864258,
+      "learning_rate": 6.376912058649559e-06,
+      "loss": 0.5914,
+      "step": 23777
+    },
+    {
+      "epoch": 0.34378790504245366,
+      "grad_norm": 0.8537029027938843,
+      "learning_rate": 6.320988383441845e-06,
+      "loss": 0.5839,
+      "step": 23808
+    },
+    {
+      "epoch": 0.34423554554381103,
+      "grad_norm": 0.9089046716690063,
+      "learning_rate": 6.265275498341452e-06,
+      "loss": 0.5846,
+      "step": 23839
+    },
+    {
+      "epoch": 0.34468318604516834,
+      "grad_norm": 1.014028787612915,
+      "learning_rate": 6.209774032060714e-06,
+      "loss": 0.5888,
+      "step": 23870
+    },
+    {
+      "epoch": 0.3451308265465257,
+      "grad_norm": 1.165295958518982,
+      "learning_rate": 6.1544846109261365e-06,
+      "loss": 0.5879,
+      "step": 23901
+    },
+    {
+      "epoch": 0.3455784670478831,
+      "grad_norm": 0.8614928126335144,
+      "learning_rate": 6.099407858871342e-06,
+      "loss": 0.5849,
+      "step": 23932
+    },
+    {
+      "epoch": 0.34602610754924046,
+      "grad_norm": 0.8964920043945312,
+      "learning_rate": 6.044544397429958e-06,
+      "loss": 0.5845,
+      "step": 23963
+    },
+    {
+      "epoch": 0.34647374805059783,
+      "grad_norm": 0.8400657176971436,
+      "learning_rate": 5.989894845728708e-06,
+      "loss": 0.5826,
+      "step": 23994
+    },
+    {
+      "epoch": 0.3469213885519552,
+      "grad_norm": 0.892620325088501,
+      "learning_rate": 5.9354598204803605e-06,
+      "loss": 0.5825,
+      "step": 24025
+    },
+    {
+      "epoch": 0.3473690290533125,
+      "grad_norm": 0.967951774597168,
+      "learning_rate": 5.881239935976762e-06,
+      "loss": 0.5906,
+      "step": 24056
+    },
+    {
+      "epoch": 0.3478166695546699,
+      "grad_norm": 0.9834921360015869,
+      "learning_rate": 5.827235804081954e-06,
+      "loss": 0.5829,
+      "step": 24087
+    },
+    {
+      "epoch": 0.34826431005602726,
+      "grad_norm": 0.9132741689682007,
+      "learning_rate": 5.773448034225221e-06,
+      "loss": 0.5922,
+      "step": 24118
+    },
+    {
+      "epoch": 0.34871195055738463,
+      "grad_norm": 0.9170505404472351,
+      "learning_rate": 5.719877233394228e-06,
+      "loss": 0.592,
+      "step": 24149
+    },
+    {
+      "epoch": 0.349159591058742,
+      "grad_norm": 0.9209049940109253,
+      "learning_rate": 5.666524006128191e-06,
+      "loss": 0.5831,
+      "step": 24180
+    },
+    {
+      "epoch": 0.3496072315600994,
+      "grad_norm": 0.9120186567306519,
+      "learning_rate": 5.613388954511015e-06,
+      "loss": 0.5855,
+      "step": 24211
+    },
+    {
+      "epoch": 0.3500548720614567,
+      "grad_norm": 0.8754975199699402,
+      "learning_rate": 5.560472678164552e-06,
+      "loss": 0.5837,
+      "step": 24242
+    },
+    {
+      "epoch": 0.35050251256281406,
+      "grad_norm": 0.8505347967147827,
+      "learning_rate": 5.507775774241775e-06,
+      "loss": 0.5917,
+      "step": 24273
+    },
+    {
+      "epoch": 0.35095015306417143,
+      "grad_norm": 0.8900067806243896,
+      "learning_rate": 5.4552988374200945e-06,
+      "loss": 0.586,
+      "step": 24304
+    },
+    {
+      "epoch": 0.3513977935655288,
+      "grad_norm": 0.9413267374038696,
+      "learning_rate": 5.403042459894597e-06,
+      "loss": 0.5842,
+      "step": 24335
+    },
+    {
+      "epoch": 0.3518454340668862,
+      "grad_norm": 0.8651320338249207,
+      "learning_rate": 5.3510072313714135e-06,
+      "loss": 0.5869,
+      "step": 24366
+    },
+    {
+      "epoch": 0.3522930745682435,
+      "grad_norm": 0.8826769590377808,
+      "learning_rate": 5.2991937390610205e-06,
+      "loss": 0.5916,
+      "step": 24397
+    },
+    {
+      "epoch": 0.35274071506960086,
+      "grad_norm": 0.8735561966896057,
+      "learning_rate": 5.247602567671625e-06,
+      "loss": 0.5948,
+      "step": 24428
+    },
+    {
+      "epoch": 0.35318835557095823,
+      "grad_norm": 0.9097628593444824,
+      "learning_rate": 5.196234299402603e-06,
+      "loss": 0.5832,
+      "step": 24459
+    },
+    {
+      "epoch": 0.3536359960723156,
+      "grad_norm": 0.8925328254699707,
+      "learning_rate": 5.145089513937865e-06,
+      "loss": 0.5855,
+      "step": 24490
+    },
+    {
+      "epoch": 0.354083636573673,
+      "grad_norm": 0.8319926857948303,
+      "learning_rate": 5.094168788439369e-06,
+      "loss": 0.596,
+      "step": 24521
+    },
+    {
+      "epoch": 0.35453127707503035,
+      "grad_norm": 0.8705253005027771,
+      "learning_rate": 5.043472697540594e-06,
+      "loss": 0.5807,
+      "step": 24552
+    },
+    {
+      "epoch": 0.35497891757638766,
+      "grad_norm": 0.8858280777931213,
+      "learning_rate": 4.993001813340012e-06,
+      "loss": 0.5922,
+      "step": 24583
+    },
+    {
+      "epoch": 0.35542655807774504,
+      "grad_norm": 0.8770239949226379,
+      "learning_rate": 4.942756705394702e-06,
+      "loss": 0.5959,
+      "step": 24614
+    },
+    {
+      "epoch": 0.3558741985791024,
+      "grad_norm": 0.9590134620666504,
+      "learning_rate": 4.892737940713884e-06,
+      "loss": 0.5944,
+      "step": 24645
+    },
+    {
+      "epoch": 0.3563218390804598,
+      "grad_norm": 0.9162303805351257,
+      "learning_rate": 4.842946083752511e-06,
+      "loss": 0.5858,
+      "step": 24676
+    },
+    {
+      "epoch": 0.35676947958181715,
+      "grad_norm": 1.051709771156311,
+      "learning_rate": 4.79338169640493e-06,
+      "loss": 0.5842,
+      "step": 24707
+    },
+    {
+      "epoch": 0.35721712008317447,
+      "grad_norm": 0.9024022221565247,
+      "learning_rate": 4.74404533799851e-06,
+      "loss": 0.5836,
+      "step": 24738
+    },
+    {
+      "epoch": 0.35766476058453184,
+      "grad_norm": 0.9624181985855103,
+      "learning_rate": 4.694937565287344e-06,
+      "loss": 0.5884,
+      "step": 24769
+    },
+    {
+      "epoch": 0.3581124010858892,
+      "grad_norm": 0.9708924889564514,
+      "learning_rate": 4.646058932445985e-06,
+      "loss": 0.5871,
+      "step": 24800
+    },
+    {
+      "epoch": 0.3585600415872466,
+      "grad_norm": 0.9564160704612732,
+      "learning_rate": 4.597409991063148e-06,
+      "loss": 0.5838,
+      "step": 24831
+    },
+    {
+      "epoch": 0.35900768208860395,
+      "grad_norm": 0.9520463347434998,
+      "learning_rate": 4.5489912901355375e-06,
+      "loss": 0.5891,
+      "step": 24862
+    },
+    {
+      "epoch": 0.3594553225899613,
+      "grad_norm": 0.8748462200164795,
+      "learning_rate": 4.500803376061608e-06,
+      "loss": 0.5871,
+      "step": 24893
+    },
+    {
+      "epoch": 0.35990296309131864,
+      "grad_norm": 0.8078939318656921,
+      "learning_rate": 4.45284679263541e-06,
+      "loss": 0.5828,
+      "step": 24924
+    },
+    {
+      "epoch": 0.360350603592676,
+      "grad_norm": 0.8798905611038208,
+      "learning_rate": 4.4051220810404775e-06,
+      "loss": 0.5945,
+      "step": 24955
+    },
+    {
+      "epoch": 0.3607982440940334,
+      "grad_norm": 1.0122028589248657,
+      "learning_rate": 4.3576297798437025e-06,
+      "loss": 0.5906,
+      "step": 24986
+    },
+    {
+      "epoch": 0.36124588459539075,
+      "grad_norm": 0.8810189962387085,
+      "learning_rate": 4.3103704249892436e-06,
+      "loss": 0.585,
+      "step": 25017
+    },
+    {
+      "epoch": 0.3616935250967481,
+      "grad_norm": 0.9921602606773376,
+      "learning_rate": 4.263344549792487e-06,
+      "loss": 0.5845,
+      "step": 25048
+    },
+    {
+      "epoch": 0.3621411655981055,
+      "grad_norm": 0.9266677498817444,
+      "learning_rate": 4.216552684934056e-06,
+      "loss": 0.5872,
+      "step": 25079
+    },
+    {
+      "epoch": 0.3625888060994628,
+      "grad_norm": 0.9869902729988098,
+      "learning_rate": 4.169995358453777e-06,
+      "loss": 0.5839,
+      "step": 25110
+    },
+    {
+      "epoch": 0.3630364466008202,
+      "grad_norm": 0.908527135848999,
+      "learning_rate": 4.123673095744757e-06,
+      "loss": 0.5816,
+      "step": 25141
+    },
+    {
+      "epoch": 0.36348408710217756,
+      "grad_norm": 0.9539284706115723,
+      "learning_rate": 4.077586419547435e-06,
+      "loss": 0.5895,
+      "step": 25172
+    },
+    {
+      "epoch": 0.3639317276035349,
+      "grad_norm": 0.9441227912902832,
+      "learning_rate": 4.03173584994368e-06,
+      "loss": 0.5818,
+      "step": 25203
+    },
+    {
+      "epoch": 0.3643793681048923,
+      "grad_norm": 0.9245622754096985,
+      "learning_rate": 3.986121904350948e-06,
+      "loss": 0.5894,
+      "step": 25234
+    },
+    {
+      "epoch": 0.3648270086062496,
+      "grad_norm": 0.9282262921333313,
+      "learning_rate": 3.940745097516407e-06,
+      "loss": 0.5867,
+      "step": 25265
+    },
+    {
+      "epoch": 0.365274649107607,
+      "grad_norm": 0.8341302871704102,
+      "learning_rate": 3.89560594151116e-06,
+      "loss": 0.5927,
+      "step": 25296
+    },
+    {
+      "epoch": 0.36572228960896436,
+      "grad_norm": 0.8950303792953491,
+      "learning_rate": 3.850704945724456e-06,
+      "loss": 0.5853,
+      "step": 25327
+    },
+    {
+      "epoch": 0.36616993011032173,
+      "grad_norm": 0.8331125378608704,
+      "learning_rate": 3.8060426168579077e-06,
+      "loss": 0.5913,
+      "step": 25358
+    },
+    {
+      "epoch": 0.3666175706116791,
+      "grad_norm": 0.9764972925186157,
+      "learning_rate": 3.7616194589198407e-06,
+      "loss": 0.5824,
+      "step": 25389
+    },
+    {
+      "epoch": 0.36706521111303647,
+      "grad_norm": 0.8486316800117493,
+      "learning_rate": 3.7174359732195574e-06,
+      "loss": 0.5886,
+      "step": 25420
+    },
+    {
+      "epoch": 0.3675128516143938,
+      "grad_norm": 0.8741101622581482,
+      "learning_rate": 3.673492658361677e-06,
+      "loss": 0.5908,
+      "step": 25451
+    },
+    {
+      "epoch": 0.36796049211575116,
+      "grad_norm": 0.9734169244766235,
+      "learning_rate": 3.6297900102405467e-06,
+      "loss": 0.5755,
+      "step": 25482
+    },
+    {
+      "epoch": 0.36840813261710853,
+      "grad_norm": 0.9409371018409729,
+      "learning_rate": 3.586328522034607e-06,
+      "loss": 0.5851,
+      "step": 25513
+    },
+    {
+      "epoch": 0.3688557731184659,
+      "grad_norm": 0.9707832336425781,
+      "learning_rate": 3.543108684200838e-06,
+      "loss": 0.5846,
+      "step": 25544
+    },
+    {
+      "epoch": 0.3693034136198233,
+      "grad_norm": 0.9187780618667603,
+      "learning_rate": 3.5001309844692464e-06,
+      "loss": 0.5863,
+      "step": 25575
+    },
+    {
+      "epoch": 0.3697510541211806,
+      "grad_norm": 0.8471042513847351,
+      "learning_rate": 3.4573959078373215e-06,
+      "loss": 0.5824,
+      "step": 25606
+    },
+    {
+      "epoch": 0.37019869462253796,
+      "grad_norm": 0.8648544549942017,
+      "learning_rate": 3.4149039365646063e-06,
+      "loss": 0.585,
+      "step": 25637
+    },
+    {
+      "epoch": 0.37064633512389533,
+      "grad_norm": 0.8514716625213623,
+      "learning_rate": 3.3726555501672143e-06,
+      "loss": 0.5853,
+      "step": 25668
+    },
+    {
+      "epoch": 0.3710939756252527,
+      "grad_norm": 0.8402500152587891,
+      "learning_rate": 3.33065122541244e-06,
+      "loss": 0.5827,
+      "step": 25699
+    },
+    {
+      "epoch": 0.3715416161266101,
+      "grad_norm": 0.9909188747406006,
+      "learning_rate": 3.288891436313385e-06,
+      "loss": 0.582,
+      "step": 25730
+    },
+    {
+      "epoch": 0.37198925662796745,
+      "grad_norm": 0.9633392691612244,
+      "learning_rate": 3.2473766541235963e-06,
+      "loss": 0.579,
+      "step": 25761
+    },
+    {
+      "epoch": 0.37243689712932476,
+      "grad_norm": 0.8810042142868042,
+      "learning_rate": 3.2061073473317466e-06,
+      "loss": 0.5849,
+      "step": 25792
+    },
+    {
+      "epoch": 0.37288453763068213,
+      "grad_norm": 0.9136834144592285,
+      "learning_rate": 3.1650839816563444e-06,
+      "loss": 0.5854,
+      "step": 25823
+    },
+    {
+      "epoch": 0.3733321781320395,
+      "grad_norm": 0.9164252877235413,
+      "learning_rate": 3.1243070200405093e-06,
+      "loss": 0.5861,
+      "step": 25854
+    },
+    {
+      "epoch": 0.3737798186333969,
+      "grad_norm": 0.8443343639373779,
+      "learning_rate": 3.0837769226467e-06,
+      "loss": 0.5806,
+      "step": 25885
+    },
+    {
+      "epoch": 0.37422745913475425,
+      "grad_norm": 0.8546344637870789,
+      "learning_rate": 3.0434941468515666e-06,
+      "loss": 0.587,
+      "step": 25916
+    },
+    {
+      "epoch": 0.3746750996361116,
+      "grad_norm": 0.8392828702926636,
+      "learning_rate": 3.003459147240753e-06,
+      "loss": 0.5838,
+      "step": 25947
+    },
+    {
+      "epoch": 0.37512274013746894,
+      "grad_norm": 0.876781165599823,
+      "learning_rate": 2.9636723756037875e-06,
+      "loss": 0.596,
+      "step": 25978
+    },
+    {
+      "epoch": 0.3755703806388263,
+      "grad_norm": 0.9352492094039917,
+      "learning_rate": 2.9241342809289833e-06,
+      "loss": 0.5832,
+      "step": 26009
+    },
+    {
+      "epoch": 0.3760180211401837,
+      "grad_norm": 0.9210870265960693,
+      "learning_rate": 2.8848453093983594e-06,
+      "loss": 0.5791,
+      "step": 26040
+    },
+    {
+      "epoch": 0.37646566164154105,
+      "grad_norm": 0.9076164960861206,
+      "learning_rate": 2.8458059043826257e-06,
+      "loss": 0.5811,
+      "step": 26071
+    },
+    {
+      "epoch": 0.3769133021428984,
+      "grad_norm": 0.8342217803001404,
+      "learning_rate": 2.807016506436172e-06,
+      "loss": 0.5822,
+      "step": 26102
+    },
+    {
+      "epoch": 0.37736094264425574,
+      "grad_norm": 0.9161880612373352,
+      "learning_rate": 2.7684775532920566e-06,
+      "loss": 0.587,
+      "step": 26133
+    },
+    {
+      "epoch": 0.3778085831456131,
+      "grad_norm": 0.8972461223602295,
+      "learning_rate": 2.7301894798571425e-06,
+      "loss": 0.5778,
+      "step": 26164
+    },
+    {
+      "epoch": 0.3782562236469705,
+      "grad_norm": 0.9267194867134094,
+      "learning_rate": 2.6921527182071386e-06,
+      "loss": 0.5824,
+      "step": 26195
+    },
+    {
+      "epoch": 0.37870386414832785,
+      "grad_norm": 0.8432844877243042,
+      "learning_rate": 2.6543676975817223e-06,
+      "loss": 0.5895,
+      "step": 26226
+    },
+    {
+      "epoch": 0.3791515046496852,
+      "grad_norm": 0.8702236413955688,
+      "learning_rate": 2.6168348443797175e-06,
+      "loss": 0.5872,
+      "step": 26257
+    },
+    {
+      "epoch": 0.3795991451510426,
+      "grad_norm": 0.9064545631408691,
+      "learning_rate": 2.5795545821542757e-06,
+      "loss": 0.5792,
+      "step": 26288
+    },
+    {
+      "epoch": 0.3800467856523999,
+      "grad_norm": 0.9529021978378296,
+      "learning_rate": 2.54252733160808e-06,
+      "loss": 0.5827,
+      "step": 26319
+    },
+    {
+      "epoch": 0.3804944261537573,
+      "grad_norm": 1.0374935865402222,
+      "learning_rate": 2.5057535105886294e-06,
+      "loss": 0.5907,
+      "step": 26350
+    },
+    {
+      "epoch": 0.38094206665511465,
+      "grad_norm": 0.8641102910041809,
+      "learning_rate": 2.4692335340834953e-06,
+      "loss": 0.5825,
+      "step": 26381
+    },
+    {
+      "epoch": 0.381389707156472,
+      "grad_norm": 0.9310511350631714,
+      "learning_rate": 2.432967814215639e-06,
+      "loss": 0.5859,
+      "step": 26412
+    },
+    {
+      "epoch": 0.3818373476578294,
+      "grad_norm": 0.8742653131484985,
+      "learning_rate": 2.396956760238794e-06,
+      "loss": 0.5831,
+      "step": 26443
+    },
+    {
+      "epoch": 0.3822849881591867,
+      "grad_norm": 0.9148630499839783,
+      "learning_rate": 2.361200778532796e-06,
+      "loss": 0.5843,
+      "step": 26474
+    },
+    {
+      "epoch": 0.3827326286605441,
+      "grad_norm": 0.7990172505378723,
+      "learning_rate": 2.325700272599049e-06,
+      "loss": 0.5867,
+      "step": 26505
+    },
+    {
+      "epoch": 0.38318026916190145,
+      "grad_norm": 1.0082364082336426,
+      "learning_rate": 2.2904556430559415e-06,
+      "loss": 0.5883,
+      "step": 26536
+    },
+    {
+      "epoch": 0.3836279096632588,
+      "grad_norm": 1.061032772064209,
+      "learning_rate": 2.2554672876343106e-06,
+      "loss": 0.5804,
+      "step": 26567
+    },
+    {
+      "epoch": 0.3840755501646162,
+      "grad_norm": 0.867182195186615,
+      "learning_rate": 2.220735601173002e-06,
+      "loss": 0.5862,
+      "step": 26598
+    },
+    {
+      "epoch": 0.38452319066597357,
+      "grad_norm": 0.9035846590995789,
+      "learning_rate": 2.186260975614382e-06,
+      "loss": 0.5798,
+      "step": 26629
+    },
+    {
+      "epoch": 0.3849708311673309,
+      "grad_norm": 0.9290494322776794,
+      "learning_rate": 2.1520437999999034e-06,
+      "loss": 0.583,
+      "step": 26660
+    },
+    {
+      "epoch": 0.38541847166868826,
+      "grad_norm": 0.8391757011413574,
+      "learning_rate": 2.1180844604657526e-06,
+      "loss": 0.5819,
+      "step": 26691
+    },
+    {
+      "epoch": 0.3858661121700456,
+      "grad_norm": 0.8569139242172241,
+      "learning_rate": 2.084383340238455e-06,
+      "loss": 0.585,
+      "step": 26722
+    },
+    {
+      "epoch": 0.386313752671403,
+      "grad_norm": 0.8917425870895386,
+      "learning_rate": 2.0509408196305704e-06,
+      "loss": 0.5914,
+      "step": 26753
+    },
+    {
+      "epoch": 0.38676139317276037,
+      "grad_norm": 0.8586505651473999,
+      "learning_rate": 2.017757276036403e-06,
+      "loss": 0.5842,
+      "step": 26784
+    },
+    {
+      "epoch": 0.38720903367411774,
+      "grad_norm": 0.8897309303283691,
+      "learning_rate": 1.984833083927726e-06,
+      "loss": 0.58,
+      "step": 26815
+    },
+    {
+      "epoch": 0.38765667417547506,
+      "grad_norm": 0.8957898616790771,
+      "learning_rate": 1.952168614849581e-06,
+      "loss": 0.5804,
+      "step": 26846
+    },
+    {
+      "epoch": 0.38810431467683243,
+      "grad_norm": 0.8361022472381592,
+      "learning_rate": 1.919764237416058e-06,
+      "loss": 0.5764,
+      "step": 26877
+    },
+    {
+      "epoch": 0.3885519551781898,
+      "grad_norm": 0.9136033058166504,
+      "learning_rate": 1.8876203173061463e-06,
+      "loss": 0.5864,
+      "step": 26908
+    },
+    {
+      "epoch": 0.38899959567954717,
+      "grad_norm": 0.8564227819442749,
+      "learning_rate": 1.8557372172596206e-06,
+      "loss": 0.5881,
+      "step": 26939
+    },
+    {
+      "epoch": 0.38944723618090454,
+      "grad_norm": 0.9318363666534424,
+      "learning_rate": 1.8241152970729341e-06,
+      "loss": 0.579,
+      "step": 26970
+    },
+    {
+      "epoch": 0.38989487668226186,
+      "grad_norm": 0.8604468703269958,
+      "learning_rate": 1.7927549135951572e-06,
+      "loss": 0.5795,
+      "step": 27001
+    },
+    {
+      "epoch": 0.39034251718361923,
+      "grad_norm": 0.9203160405158997,
+      "learning_rate": 1.7616564207239477e-06,
+      "loss": 0.5821,
+      "step": 27032
+    },
+    {
+      "epoch": 0.3907901576849766,
+      "grad_norm": 1.0079877376556396,
+      "learning_rate": 1.730820169401584e-06,
+      "loss": 0.5752,
+      "step": 27063
+    },
+    {
+      "epoch": 0.391237798186334,
+      "grad_norm": 0.8484696745872498,
+      "learning_rate": 1.7002465076109558e-06,
+      "loss": 0.5825,
+      "step": 27094
+    },
+    {
+      "epoch": 0.39168543868769135,
+      "grad_norm": 0.969445526599884,
+      "learning_rate": 1.6699357803716898e-06,
+      "loss": 0.5829,
+      "step": 27125
+    },
+    {
+      "epoch": 0.3921330791890487,
+      "grad_norm": 0.902077853679657,
+      "learning_rate": 1.6398883297362305e-06,
+      "loss": 0.5829,
+      "step": 27156
+    },
+    {
+      "epoch": 0.39258071969040603,
+      "grad_norm": 0.9344344735145569,
+      "learning_rate": 1.6101044947859606e-06,
+      "loss": 0.5901,
+      "step": 27187
+    },
+    {
+      "epoch": 0.3930283601917634,
+      "grad_norm": 0.8951054811477661,
+      "learning_rate": 1.5805846116274114e-06,
+      "loss": 0.581,
+      "step": 27218
+    },
+    {
+      "epoch": 0.3934760006931208,
+      "grad_norm": 0.8078446388244629,
+      "learning_rate": 1.5513290133884611e-06,
+      "loss": 0.5862,
+      "step": 27249
+    },
+    {
+      "epoch": 0.39392364119447815,
+      "grad_norm": 0.8421075940132141,
+      "learning_rate": 1.5223380302145512e-06,
+      "loss": 0.59,
+      "step": 27280
+    },
+    {
+      "epoch": 0.3943712816958355,
+      "grad_norm": 0.8337625861167908,
+      "learning_rate": 1.4936119892649925e-06,
+      "loss": 0.5828,
+      "step": 27311
+    },
+    {
+      "epoch": 0.39481892219719283,
+      "grad_norm": 0.984302282333374,
+      "learning_rate": 1.4651512147092482e-06,
+      "loss": 0.5824,
+      "step": 27342
+    },
+    {
+      "epoch": 0.3952665626985502,
+      "grad_norm": 0.9869680404663086,
+      "learning_rate": 1.4369560277232908e-06,
+      "loss": 0.5913,
+      "step": 27373
+    },
+    {
+      "epoch": 0.3957142031999076,
+      "grad_norm": 0.8444851040840149,
+      "learning_rate": 1.409026746485978e-06,
+      "loss": 0.5821,
+      "step": 27404
+    },
+    {
+      "epoch": 0.39616184370126495,
+      "grad_norm": 0.776551365852356,
+      "learning_rate": 1.3813636861754464e-06,
+      "loss": 0.5842,
+      "step": 27435
+    },
+    {
+      "epoch": 0.3966094842026223,
+      "grad_norm": 0.8994253277778625,
+      "learning_rate": 1.3539671589655773e-06,
+      "loss": 0.5852,
+      "step": 27466
+    },
+    {
+      "epoch": 0.3970571247039797,
+      "grad_norm": 0.9431787729263306,
+      "learning_rate": 1.3268374740224548e-06,
+      "loss": 0.5814,
+      "step": 27497
+    },
+    {
+      "epoch": 0.397504765205337,
+      "grad_norm": 0.8683516383171082,
+      "learning_rate": 1.2999749375008807e-06,
+      "loss": 0.584,
+      "step": 27528
+    },
+    {
+      "epoch": 0.3979524057066944,
+      "grad_norm": 0.8702425956726074,
+      "learning_rate": 1.2733798525409346e-06,
+      "loss": 0.5824,
+      "step": 27559
+    },
+    {
+      "epoch": 0.39840004620805175,
+      "grad_norm": 0.9975656270980835,
+      "learning_rate": 1.2470525192645383e-06,
+      "loss": 0.5791,
+      "step": 27590
+    },
+    {
+      "epoch": 0.3988476867094091,
+      "grad_norm": 0.8925862312316895,
+      "learning_rate": 1.2209932347720666e-06,
+      "loss": 0.5793,
+      "step": 27621
+    },
+    {
+      "epoch": 0.3992953272107665,
+      "grad_norm": 0.8167952299118042,
+      "learning_rate": 1.1952022931389972e-06,
+      "loss": 0.5788,
+      "step": 27652
+    },
+    {
+      "epoch": 0.39974296771212386,
+      "grad_norm": 0.8877468705177307,
+      "learning_rate": 1.1696799854126083e-06,
+      "loss": 0.5879,
+      "step": 27683
+    },
+    {
+      "epoch": 0.4001906082134812,
+      "grad_norm": 0.919375479221344,
+      "learning_rate": 1.1444265996086694e-06,
+      "loss": 0.5895,
+      "step": 27714
+    },
+    {
+      "epoch": 0.40063824871483855,
+      "grad_norm": 0.9435774087905884,
+      "learning_rate": 1.119442420708211e-06,
+      "loss": 0.5801,
+      "step": 27745
+    },
+    {
+      "epoch": 0.4010858892161959,
+      "grad_norm": 0.9303644299507141,
+      "learning_rate": 1.0947277306542964e-06,
+      "loss": 0.5887,
+      "step": 27776
+    },
+    {
+      "epoch": 0.4015335297175533,
+      "grad_norm": 0.9209316968917847,
+      "learning_rate": 1.0702828083488353e-06,
+      "loss": 0.5908,
+      "step": 27807
+    },
+    {
+      "epoch": 0.40198117021891067,
+      "grad_norm": 0.8827571272850037,
+      "learning_rate": 1.0461079296494647e-06,
+      "loss": 0.5846,
+      "step": 27838
+    },
+    {
+      "epoch": 0.402428810720268,
+      "grad_norm": 0.908743143081665,
+      "learning_rate": 1.0222033673663978e-06,
+      "loss": 0.5818,
+      "step": 27869
+    },
+    {
+      "epoch": 0.40287645122162535,
+      "grad_norm": 0.9340828061103821,
+      "learning_rate": 9.985693912593713e-07,
+      "loss": 0.5896,
+      "step": 27900
+    },
+    {
+      "epoch": 0.4033240917229827,
+      "grad_norm": 0.9275760054588318,
+      "learning_rate": 9.752062680346035e-07,
+      "loss": 0.5886,
+      "step": 27931
+    },
+    {
+      "epoch": 0.4037717322243401,
+      "grad_norm": 0.9111776947975159,
+      "learning_rate": 9.521142613417494e-07,
+      "loss": 0.5852,
+      "step": 27962
+    },
+    {
+      "epoch": 0.40421937272569747,
+      "grad_norm": 0.9355432987213135,
+      "learning_rate": 9.292936317709722e-07,
+      "loss": 0.5808,
+      "step": 27993
+    },
+    {
+      "epoch": 0.40466701322705484,
+      "grad_norm": 0.8610454201698303,
+      "learning_rate": 9.067446368499793e-07,
+      "loss": 0.5839,
+      "step": 28024
+    },
+    {
+      "epoch": 0.40511465372841216,
+      "grad_norm": 0.9698997735977173,
+      "learning_rate": 8.844675310411055e-07,
+      "loss": 0.5846,
+      "step": 28055
+    },
+    {
+      "epoch": 0.4055622942297695,
+      "grad_norm": 0.864654004573822,
+      "learning_rate": 8.6246256573847e-07,
+      "loss": 0.5768,
+      "step": 28086
+    },
+    {
+      "epoch": 0.4060099347311269,
+      "grad_norm": 0.9228002429008484,
+      "learning_rate": 8.407299892651127e-07,
+      "loss": 0.5801,
+      "step": 28117
+    },
+    {
+      "epoch": 0.40645757523248427,
+      "grad_norm": 0.9443821907043457,
+      "learning_rate": 8.19270046870202e-07,
+      "loss": 0.5826,
+      "step": 28148
+    },
+    {
+      "epoch": 0.40690521573384164,
+      "grad_norm": 0.8702989816665649,
+      "learning_rate": 7.980829807262752e-07,
+      "loss": 0.5804,
+      "step": 28179
+    },
+    {
+      "epoch": 0.40735285623519896,
+      "grad_norm": 0.922178328037262,
+      "learning_rate": 7.771690299264889e-07,
+      "loss": 0.5764,
+      "step": 28210
+    },
+    {
+      "epoch": 0.40780049673655633,
+      "grad_norm": 0.9058007001876831,
+      "learning_rate": 7.565284304819426e-07,
+      "loss": 0.5868,
+      "step": 28241
+    },
+    {
+      "epoch": 0.4082481372379137,
+      "grad_norm": 0.900319516658783,
+      "learning_rate": 7.361614153189922e-07,
+      "loss": 0.5792,
+      "step": 28272
+    },
+    {
+      "epoch": 0.40869577773927107,
+      "grad_norm": 0.9342830777168274,
+      "learning_rate": 7.160682142766328e-07,
+      "loss": 0.5865,
+      "step": 28303
+    },
+    {
+      "epoch": 0.40914341824062844,
+      "grad_norm": 0.9849163293838501,
+      "learning_rate": 6.962490541039091e-07,
+      "loss": 0.5864,
+      "step": 28334
+    },
+    {
+      "epoch": 0.4095910587419858,
+      "grad_norm": 0.9586670398712158,
+      "learning_rate": 6.767041584573531e-07,
+      "loss": 0.5846,
+      "step": 28365
+    },
+    {
+      "epoch": 0.41003869924334313,
+      "grad_norm": 0.8707197308540344,
+      "learning_rate": 6.574337478984532e-07,
+      "loss": 0.5863,
+      "step": 28396
+    },
+    {
+      "epoch": 0.4104863397447005,
+      "grad_norm": 0.861381471157074,
+      "learning_rate": 6.384380398911732e-07,
+      "loss": 0.5829,
+      "step": 28427
+    },
+    {
+      "epoch": 0.4109339802460579,
+      "grad_norm": 0.8965010643005371,
+      "learning_rate": 6.197172487994951e-07,
+      "loss": 0.5804,
+      "step": 28458
+    },
+    {
+      "epoch": 0.41138162074741524,
+      "grad_norm": 0.8720385432243347,
+      "learning_rate": 6.012715858850021e-07,
+      "loss": 0.5824,
+      "step": 28489
+    },
+    {
+      "epoch": 0.4118292612487726,
+      "grad_norm": 0.9194157123565674,
+      "learning_rate": 5.831012593044971e-07,
+      "loss": 0.5829,
+      "step": 28520
+    },
+    {
+      "epoch": 0.41227690175013,
+      "grad_norm": 0.9252064228057861,
+      "learning_rate": 5.652064741076435e-07,
+      "loss": 0.5994,
+      "step": 28551
+    },
+    {
+      "epoch": 0.4127245422514873,
+      "grad_norm": 0.8969072103500366,
+      "learning_rate": 5.475874322346558e-07,
+      "loss": 0.5831,
+      "step": 28582
+    },
+    {
+      "epoch": 0.4131721827528447,
+      "grad_norm": 0.8892102241516113,
+      "learning_rate": 5.30244332514035e-07,
+      "loss": 0.5832,
+      "step": 28613
+    },
+    {
+      "epoch": 0.41361982325420205,
+      "grad_norm": 0.8727803826332092,
+      "learning_rate": 5.131773706602977e-07,
+      "loss": 0.5826,
+      "step": 28644
+    },
+    {
+      "epoch": 0.4140674637555594,
+      "grad_norm": 0.909462034702301,
+      "learning_rate": 4.963867392717897e-07,
+      "loss": 0.5776,
+      "step": 28675
+    },
+    {
+      "epoch": 0.4145151042569168,
+      "grad_norm": 0.9155588150024414,
+      "learning_rate": 4.798726278285093e-07,
+      "loss": 0.5869,
+      "step": 28706
+    },
+    {
+      "epoch": 0.4149627447582741,
+      "grad_norm": 0.8983290195465088,
+      "learning_rate": 4.6363522268995097e-07,
+      "loss": 0.5829,
+      "step": 28737
+    },
+    {
+      "epoch": 0.4154103852596315,
+      "grad_norm": 0.8464504480361938,
+      "learning_rate": 4.4767470709302927e-07,
+      "loss": 0.5724,
+      "step": 28768
+    },
+    {
+      "epoch": 0.41585802576098885,
+      "grad_norm": 0.8773616552352905,
+      "learning_rate": 4.319912611499971e-07,
+      "loss": 0.5818,
+      "step": 28799
+    },
+    {
+      "epoch": 0.4163056662623462,
+      "grad_norm": 0.9296445250511169,
+      "learning_rate": 4.1658506184640564e-07,
+      "loss": 0.5771,
+      "step": 28830
+    },
+    {
+      "epoch": 0.4167533067637036,
+      "grad_norm": 0.9054000973701477,
+      "learning_rate": 4.0145628303911996e-07,
+      "loss": 0.5854,
+      "step": 28861
+    },
+    {
+      "epoch": 0.41720094726506096,
+      "grad_norm": 0.8678483366966248,
+      "learning_rate": 3.866050954543565e-07,
+      "loss": 0.5802,
+      "step": 28892
+    },
+    {
+      "epoch": 0.4176485877664183,
+      "grad_norm": 0.8449427485466003,
+      "learning_rate": 3.720316666857432e-07,
+      "loss": 0.5808,
+      "step": 28923
+    },
+    {
+      "epoch": 0.41809622826777565,
+      "grad_norm": 1.0177295207977295,
+      "learning_rate": 3.5773616119244845e-07,
+      "loss": 0.58,
+      "step": 28954
+    },
+    {
+      "epoch": 0.418543868769133,
+      "grad_norm": 0.9004929065704346,
+      "learning_rate": 3.437187402973052e-07,
+      "loss": 0.5885,
+      "step": 28985
+    },
+    {
+      "epoch": 0.4189915092704904,
+      "grad_norm": 0.8589534163475037,
+      "learning_rate": 3.2997956218500104e-07,
+      "loss": 0.577,
+      "step": 29016
+    },
+    {
+      "epoch": 0.41943914977184776,
+      "grad_norm": 0.8590395450592041,
+      "learning_rate": 3.165187819003018e-07,
+      "loss": 0.583,
+      "step": 29047
+    },
+    {
+      "epoch": 0.4198867902732051,
+      "grad_norm": 0.8661704063415527,
+      "learning_rate": 3.033365513462755e-07,
+      "loss": 0.5921,
+      "step": 29078
+    },
+    {
+      "epoch": 0.42033443077456245,
+      "grad_norm": 0.9449494481086731,
+      "learning_rate": 2.9043301928260437e-07,
+      "loss": 0.5846,
+      "step": 29109
+    },
+    {
+      "epoch": 0.4207820712759198,
+      "grad_norm": 0.8835760951042175,
+      "learning_rate": 2.7780833132389773e-07,
+      "loss": 0.5841,
+      "step": 29140
+    },
+    {
+      "epoch": 0.4212297117772772,
+      "grad_norm": 0.8761197328567505,
+      "learning_rate": 2.6546262993803473e-07,
+      "loss": 0.5855,
+      "step": 29171
+    },
+    {
+      "epoch": 0.42167735227863457,
+      "grad_norm": 0.9937579035758972,
+      "learning_rate": 2.533960544445879e-07,
+      "loss": 0.58,
+      "step": 29202
+    },
+    {
+      "epoch": 0.42212499277999194,
+      "grad_norm": 0.830741822719574,
+      "learning_rate": 2.416087410132134e-07,
+      "loss": 0.5828,
+      "step": 29233
+    },
+    {
+      "epoch": 0.42257263328134925,
+      "grad_norm": 0.9345104098320007,
+      "learning_rate": 2.301008226621465e-07,
+      "loss": 0.5838,
+      "step": 29264
+    },
+    {
+      "epoch": 0.4230202737827066,
+      "grad_norm": 0.8927823305130005,
+      "learning_rate": 2.1887242925668073e-07,
+      "loss": 0.5823,
+      "step": 29295
+    },
+    {
+      "epoch": 0.423467914284064,
+      "grad_norm": 0.9171820282936096,
+      "learning_rate": 2.0792368750770785e-07,
+      "loss": 0.5884,
+      "step": 29326
+    },
+    {
+      "epoch": 0.42391555478542137,
+      "grad_norm": 0.9589295387268066,
+      "learning_rate": 1.9725472097028851e-07,
+      "loss": 0.5855,
+      "step": 29357
+    },
+    {
+      "epoch": 0.42436319528677874,
+      "grad_norm": 0.9396247267723083,
+      "learning_rate": 1.8686565004226718e-07,
+      "loss": 0.5898,
+      "step": 29388
+    },
+    {
+      "epoch": 0.4248108357881361,
+      "grad_norm": 0.8725122809410095,
+      "learning_rate": 1.7675659196288995e-07,
+      "loss": 0.5878,
+      "step": 29419
+    },
+    {
+      "epoch": 0.4252584762894934,
+      "grad_norm": 0.8159589767456055,
+      "learning_rate": 1.6692766081150556e-07,
+      "loss": 0.5793,
+      "step": 29450
+    },
+    {
+      "epoch": 0.4257061167908508,
+      "grad_norm": 0.9211475253105164,
+      "learning_rate": 1.5737896750626647e-07,
+      "loss": 0.5823,
+      "step": 29481
+    },
+    {
+      "epoch": 0.42615375729220817,
+      "grad_norm": 0.8058143258094788,
+      "learning_rate": 1.4811061980287976e-07,
+      "loss": 0.5828,
+      "step": 29512
+    },
+    {
+      "epoch": 0.42660139779356554,
+      "grad_norm": 1.0279886722564697,
+      "learning_rate": 1.3912272229338886e-07,
+      "loss": 0.5832,
+      "step": 29543
+    },
+    {
+      "epoch": 0.4270490382949229,
+      "grad_norm": 0.9076089262962341,
+      "learning_rate": 1.3041537640499645e-07,
+      "loss": 0.5802,
+      "step": 29574
+    },
+    {
+      "epoch": 0.4274966787962802,
+      "grad_norm": 0.9607642889022827,
+      "learning_rate": 1.2198868039891564e-07,
+      "loss": 0.5901,
+      "step": 29605
+    },
+    {
+      "epoch": 0.4279443192976376,
+      "grad_norm": 0.9149357080459595,
+      "learning_rate": 1.138427293692651e-07,
+      "loss": 0.5801,
+      "step": 29636
+    },
+    {
+      "epoch": 0.42839195979899497,
+      "grad_norm": 0.8789470791816711,
+      "learning_rate": 1.0597761524199778e-07,
+      "loss": 0.5876,
+      "step": 29667
+    },
+    {
+      "epoch": 0.42883960030035234,
+      "grad_norm": 0.8680891394615173,
+      "learning_rate": 9.839342677385455e-08,
+      "loss": 0.5871,
+      "step": 29698
+    },
+    {
+      "epoch": 0.4292872408017097,
+      "grad_norm": 0.9469189643859863,
+      "learning_rate": 9.109024955137325e-08,
+      "loss": 0.5806,
+      "step": 29729
+    },
+    {
+      "epoch": 0.4297348813030671,
+      "grad_norm": 0.8555542230606079,
+      "learning_rate": 8.406816598991729e-08,
+      "loss": 0.5878,
+      "step": 29760
+    },
+    {
+      "epoch": 0.4301825218044244,
+      "grad_norm": 0.8864733576774597,
+      "learning_rate": 7.73272553327431e-08,
+      "loss": 0.5814,
+      "step": 29791
+    },
+    {
+      "epoch": 0.43063016230578177,
+      "grad_norm": 0.9179209470748901,
+      "learning_rate": 7.086759365011186e-08,
+      "loss": 0.5842,
+      "step": 29822
+    },
+    {
+      "epoch": 0.43107780280713914,
+      "grad_norm": 0.952071487903595,
+      "learning_rate": 6.468925383842639e-08,
+      "loss": 0.5806,
+      "step": 29853
+    },
+    {
+      "epoch": 0.4315254433084965,
+      "grad_norm": 0.9271907806396484,
+      "learning_rate": 5.8792305619415067e-08,
+      "loss": 0.5857,
+      "step": 29884
+    },
+    {
+      "epoch": 0.4319730838098539,
+      "grad_norm": 0.9237009286880493,
+      "learning_rate": 5.317681553933529e-08,
+      "loss": 0.5714,
+      "step": 29915
+    },
+    {
+      "epoch": 0.4324207243112112,
+      "grad_norm": 0.9051603078842163,
+      "learning_rate": 4.78428469682296e-08,
+      "loss": 0.5876,
+      "step": 29946
+    },
+    {
+      "epoch": 0.4328683648125686,
+      "grad_norm": 0.8672299385070801,
+      "learning_rate": 4.2790460099206844e-08,
+      "loss": 0.5837,
+      "step": 29977
+    },
+    {
+      "epoch": 0.43331600531392594,
+      "grad_norm": 0.9098496437072754,
+      "learning_rate": 3.801971194777043e-08,
+      "loss": 0.5865,
+      "step": 30008
+    },
+    {
+      "epoch": 0.4337636458152833,
+      "grad_norm": 0.838880717754364,
+      "learning_rate": 3.353065635115782e-08,
+      "loss": 0.589,
+      "step": 30039
+    },
+    {
+      "epoch": 0.4342112863166407,
+      "grad_norm": 0.8752391338348389,
+      "learning_rate": 2.93233439677576e-08,
+      "loss": 0.5921,
+      "step": 30070
+    },
+    {
+      "epoch": 0.43465892681799806,
+      "grad_norm": 0.8784201741218567,
+      "learning_rate": 2.539782227651555e-08,
+      "loss": 0.5842,
+      "step": 30101
+    },
+    {
+      "epoch": 0.4351065673193554,
+      "grad_norm": 0.8383466601371765,
+      "learning_rate": 2.175413557641004e-08,
+      "loss": 0.58,
+      "step": 30132
+    },
+    {
+      "epoch": 0.43555420782071275,
+      "grad_norm": 0.8927497267723083,
+      "learning_rate": 1.839232498594967e-08,
+      "loss": 0.5817,
+      "step": 30163
+    },
+    {
+      "epoch": 0.4360018483220701,
+      "grad_norm": 0.8814988136291504,
+      "learning_rate": 1.5312428442712522e-08,
+      "loss": 0.5815,
+      "step": 30194
+    },
+    {
+      "epoch": 0.4364494888234275,
+      "grad_norm": 0.8772262930870056,
+      "learning_rate": 1.2514480702913168e-08,
+      "loss": 0.5828,
+      "step": 30225
+    },
+    {
+      "epoch": 0.43689712932478486,
+      "grad_norm": 0.9208593368530273,
+      "learning_rate": 9.998513341005766e-09,
+      "loss": 0.5832,
+      "step": 30256
+    },
+    {
+      "epoch": 0.43734476982614223,
+      "grad_norm": 0.9014251828193665,
+      "learning_rate": 7.764554749345454e-09,
+      "loss": 0.5859,
+      "step": 30287
+    },
+    {
+      "epoch": 0.43779241032749955,
+      "grad_norm": 0.8643161654472351,
+      "learning_rate": 5.812630137849717e-09,
+      "loss": 0.5818,
+      "step": 30318
+    },
+    {
+      "epoch": 0.4382400508288569,
+      "grad_norm": 0.9332028031349182,
+      "learning_rate": 4.142761533723616e-09,
+      "loss": 0.5824,
+      "step": 30349
+    },
+    {
+      "epoch": 0.4386876913302143,
+      "grad_norm": 0.937886118888855,
+      "learning_rate": 2.7549677812044317e-09,
+      "loss": 0.5827,
+      "step": 30380
+    },
+    {
+      "epoch": 0.43913533183157166,
+      "grad_norm": 0.8592664003372192,
+      "learning_rate": 1.6492645413590525e-09,
+      "loss": 0.5874,
+      "step": 30411
+    },
+    {
+      "epoch": 0.43958297233292903,
+      "grad_norm": 0.9078693985939026,
+      "learning_rate": 8.256642918980096e-10,
+      "loss": 0.5887,
+      "step": 30442
+    },
+    {
+      "epoch": 0.44003061283428635,
+      "grad_norm": 0.9738094210624695,
+      "learning_rate": 2.841763270367004e-10,
+      "loss": 0.5885,
+      "step": 30473
+    },
+    {
+      "epoch": 0.4404782533356437,
+      "grad_norm": 0.8958234786987305,
+      "learning_rate": 2.480675739269245e-11,
+      "loss": 0.588,
+      "step": 30504
+    }
+  ],
+  "logging_steps": 31,
+  "max_steps": 30517,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 7630,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.251434749612104e+19,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-30517/training_args.bin b/checkpoint-30517/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542
--- /dev/null
+++ b/checkpoint-30517/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3
+size 5432
diff --git a/checkpoint-7630/config.json b/checkpoint-7630/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09
--- /dev/null
+++ b/checkpoint-7630/config.json
@@ -0,0 +1,36 @@
+{
+  "_name_or_path": "meta-llama/Llama-3.1-8B",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128001,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0.dev0",
+  "use_cache": true,
+  "vocab_size": 128256
+}
diff --git a/checkpoint-7630/generation_config.json b/checkpoint-7630/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507
--- /dev/null
+++ b/checkpoint-7630/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": 128001,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.47.0.dev0"
+}
diff --git a/checkpoint-7630/model-00001-of-00007.safetensors b/checkpoint-7630/model-00001-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ce7f182e82e329320bcb36b04cf7c74d588805c1
--- /dev/null
+++ b/checkpoint-7630/model-00001-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1304ccb28c7baf2e11d4151ac4ce39ec5e74ac35fe1bb64ae0c3a4fc5feae30
+size 4886466168
diff --git a/checkpoint-7630/model-00002-of-00007.safetensors b/checkpoint-7630/model-00002-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961
--- /dev/null
+++ b/checkpoint-7630/model-00002-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64
+size 4832007448
diff --git a/checkpoint-7630/model-00003-of-00007.safetensors b/checkpoint-7630/model-00003-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff
--- /dev/null
+++ b/checkpoint-7630/model-00003-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97
+size 4999813112
diff --git a/checkpoint-7630/model-00004-of-00007.safetensors b/checkpoint-7630/model-00004-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a
--- /dev/null
+++ b/checkpoint-7630/model-00004-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042
+size 4999813128
diff --git a/checkpoint-7630/model-00005-of-00007.safetensors b/checkpoint-7630/model-00005-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89
--- /dev/null
+++ b/checkpoint-7630/model-00005-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7
+size 4832007496
diff --git a/checkpoint-7630/model-00006-of-00007.safetensors b/checkpoint-7630/model-00006-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6c02f856c6d61d591503cb8f20e2f86b84e85cbd
--- /dev/null
+++ b/checkpoint-7630/model-00006-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ca1ae72980a5b1170f14c77aa114b6676a47644ca1ee08bb557eada38076fd1
+size 4999813120
diff --git a/checkpoint-7630/model-00007-of-00007.safetensors b/checkpoint-7630/model-00007-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5af7b9622701e0303b6aa879010313980169acc3
--- /dev/null
+++ b/checkpoint-7630/model-00007-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2478f25f3b8db07780b95525b898a1239e19e98136f2aede1f221e3f596bb730
+size 2571158184
diff --git a/checkpoint-7630/model.safetensors.index.json b/checkpoint-7630/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13
--- /dev/null
+++ b/checkpoint-7630/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 32121044992
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00007-of-00007.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.norm.weight": "model-00007-of-00007.safetensors"
+  }
+}
diff --git a/checkpoint-7630/optimizer.pt b/checkpoint-7630/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..81e8e7b7562be962bdc3fe6511d7c45a77a7d64d
--- /dev/null
+++ b/checkpoint-7630/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e443304a650a93fae8c01bee8207cb57a93b585bccc7159711506c1f7eae0751
+size 15385036334
diff --git a/checkpoint-7630/rng_state.pth b/checkpoint-7630/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93a52228ea16468eaab41b66555ced3eaaea002c
--- /dev/null
+++ b/checkpoint-7630/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
+size 14244
diff --git a/checkpoint-7630/scheduler.pt b/checkpoint-7630/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0694ebf59305ba0ae1a85e2530a75a5934817c99
--- /dev/null
+++ b/checkpoint-7630/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:534e20fd2d9d5b7f7b2cacb37e2f2890c1ef6152d939158d158e395d1f32997e
+size 1064
diff --git a/checkpoint-7630/trainer_state.json b/checkpoint-7630/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c496fd173565158d577ec420f3d8d947ee3f8357
--- /dev/null
+++ b/checkpoint-7630/trainer_state.json
@@ -0,0 +1,1755 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.1101773233986022,
+  "eval_steps": 500,
+  "global_step": 7630,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0004476405013573615,
+      "grad_norm": 4.6696085929870605,
+      "learning_rate": 1.0157273918741808e-06,
+      "loss": 0.9366,
+      "step": 31
+    },
+    {
+      "epoch": 0.000895281002714723,
+      "grad_norm": 4.250915050506592,
+      "learning_rate": 2.0314547837483616e-06,
+      "loss": 0.9002,
+      "step": 62
+    },
+    {
+      "epoch": 0.0013429215040720846,
+      "grad_norm": 4.424270153045654,
+      "learning_rate": 3.0471821756225426e-06,
+      "loss": 0.8843,
+      "step": 93
+    },
+    {
+      "epoch": 0.001790562005429446,
+      "grad_norm": 4.56964635848999,
+      "learning_rate": 4.062909567496723e-06,
+      "loss": 0.8717,
+      "step": 124
+    },
+    {
+      "epoch": 0.0022382025067868077,
+      "grad_norm": 4.051624298095703,
+      "learning_rate": 5.078636959370905e-06,
+      "loss": 0.8711,
+      "step": 155
+    },
+    {
+      "epoch": 0.002685843008144169,
+      "grad_norm": 3.98006272315979,
+      "learning_rate": 6.094364351245085e-06,
+      "loss": 0.8628,
+      "step": 186
+    },
+    {
+      "epoch": 0.0031334835095015307,
+      "grad_norm": 4.4158406257629395,
+      "learning_rate": 7.110091743119267e-06,
+      "loss": 0.871,
+      "step": 217
+    },
+    {
+      "epoch": 0.003581124010858892,
+      "grad_norm": 4.681333541870117,
+      "learning_rate": 8.125819134993446e-06,
+      "loss": 0.8593,
+      "step": 248
+    },
+    {
+      "epoch": 0.004028764512216254,
+      "grad_norm": 3.8057820796966553,
+      "learning_rate": 9.141546526867629e-06,
+      "loss": 0.8558,
+      "step": 279
+    },
+    {
+      "epoch": 0.0044764050135736155,
+      "grad_norm": 4.523633003234863,
+      "learning_rate": 1.015727391874181e-05,
+      "loss": 0.8676,
+      "step": 310
+    },
+    {
+      "epoch": 0.0049240455149309765,
+      "grad_norm": 3.7387187480926514,
+      "learning_rate": 1.117300131061599e-05,
+      "loss": 0.8585,
+      "step": 341
+    },
+    {
+      "epoch": 0.005371686016288338,
+      "grad_norm": 4.187750816345215,
+      "learning_rate": 1.218872870249017e-05,
+      "loss": 0.8592,
+      "step": 372
+    },
+    {
+      "epoch": 0.005819326517645699,
+      "grad_norm": 3.782883644104004,
+      "learning_rate": 1.3204456094364351e-05,
+      "loss": 0.8449,
+      "step": 403
+    },
+    {
+      "epoch": 0.006266967019003061,
+      "grad_norm": 3.577796459197998,
+      "learning_rate": 1.4220183486238533e-05,
+      "loss": 0.8418,
+      "step": 434
+    },
+    {
+      "epoch": 0.006714607520360423,
+      "grad_norm": 3.1408321857452393,
+      "learning_rate": 1.5235910878112714e-05,
+      "loss": 0.8577,
+      "step": 465
+    },
+    {
+      "epoch": 0.007162248021717784,
+      "grad_norm": 4.090081691741943,
+      "learning_rate": 1.6251638269986893e-05,
+      "loss": 0.8439,
+      "step": 496
+    },
+    {
+      "epoch": 0.007609888523075146,
+      "grad_norm": 2.7458200454711914,
+      "learning_rate": 1.7267365661861077e-05,
+      "loss": 0.8468,
+      "step": 527
+    },
+    {
+      "epoch": 0.008057529024432507,
+      "grad_norm": 3.703225612640381,
+      "learning_rate": 1.8283093053735257e-05,
+      "loss": 0.8385,
+      "step": 558
+    },
+    {
+      "epoch": 0.008505169525789868,
+      "grad_norm": 3.134650230407715,
+      "learning_rate": 1.9298820445609438e-05,
+      "loss": 0.8418,
+      "step": 589
+    },
+    {
+      "epoch": 0.008952810027147231,
+      "grad_norm": 3.762680768966675,
+      "learning_rate": 2.031454783748362e-05,
+      "loss": 0.8312,
+      "step": 620
+    },
+    {
+      "epoch": 0.009400450528504592,
+      "grad_norm": 3.751004457473755,
+      "learning_rate": 2.13302752293578e-05,
+      "loss": 0.8251,
+      "step": 651
+    },
+    {
+      "epoch": 0.009848091029861953,
+      "grad_norm": 3.2268712520599365,
+      "learning_rate": 2.234600262123198e-05,
+      "loss": 0.8369,
+      "step": 682
+    },
+    {
+      "epoch": 0.010295731531219316,
+      "grad_norm": 3.5854289531707764,
+      "learning_rate": 2.336173001310616e-05,
+      "loss": 0.826,
+      "step": 713
+    },
+    {
+      "epoch": 0.010743372032576677,
+      "grad_norm": 3.9910435676574707,
+      "learning_rate": 2.437745740498034e-05,
+      "loss": 0.8168,
+      "step": 744
+    },
+    {
+      "epoch": 0.011191012533934038,
+      "grad_norm": 3.3059303760528564,
+      "learning_rate": 2.5393184796854525e-05,
+      "loss": 0.8269,
+      "step": 775
+    },
+    {
+      "epoch": 0.011638653035291399,
+      "grad_norm": 3.4081811904907227,
+      "learning_rate": 2.6408912188728702e-05,
+      "loss": 0.817,
+      "step": 806
+    },
+    {
+      "epoch": 0.012086293536648762,
+      "grad_norm": 3.2740163803100586,
+      "learning_rate": 2.7424639580602886e-05,
+      "loss": 0.8195,
+      "step": 837
+    },
+    {
+      "epoch": 0.012533934038006123,
+      "grad_norm": 2.7206223011016846,
+      "learning_rate": 2.8440366972477066e-05,
+      "loss": 0.8188,
+      "step": 868
+    },
+    {
+      "epoch": 0.012981574539363484,
+      "grad_norm": 2.7005629539489746,
+      "learning_rate": 2.9456094364351244e-05,
+      "loss": 0.8127,
+      "step": 899
+    },
+    {
+      "epoch": 0.013429215040720846,
+      "grad_norm": 2.970745325088501,
+      "learning_rate": 3.0471821756225428e-05,
+      "loss": 0.8126,
+      "step": 930
+    },
+    {
+      "epoch": 0.013876855542078207,
+      "grad_norm": 2.4761953353881836,
+      "learning_rate": 3.148754914809961e-05,
+      "loss": 0.82,
+      "step": 961
+    },
+    {
+      "epoch": 0.014324496043435568,
+      "grad_norm": 2.8555397987365723,
+      "learning_rate": 3.2503276539973785e-05,
+      "loss": 0.8166,
+      "step": 992
+    },
+    {
+      "epoch": 0.01477213654479293,
+      "grad_norm": 2.8124194145202637,
+      "learning_rate": 3.351900393184797e-05,
+      "loss": 0.8057,
+      "step": 1023
+    },
+    {
+      "epoch": 0.015219777046150292,
+      "grad_norm": 2.353851556777954,
+      "learning_rate": 3.453473132372215e-05,
+      "loss": 0.8064,
+      "step": 1054
+    },
+    {
+      "epoch": 0.015667417547507653,
+      "grad_norm": 3.0127620697021484,
+      "learning_rate": 3.555045871559633e-05,
+      "loss": 0.8086,
+      "step": 1085
+    },
+    {
+      "epoch": 0.016115058048865014,
+      "grad_norm": 2.792686939239502,
+      "learning_rate": 3.6566186107470514e-05,
+      "loss": 0.8152,
+      "step": 1116
+    },
+    {
+      "epoch": 0.016562698550222375,
+      "grad_norm": 2.407134532928467,
+      "learning_rate": 3.7581913499344695e-05,
+      "loss": 0.7949,
+      "step": 1147
+    },
+    {
+      "epoch": 0.017010339051579736,
+      "grad_norm": 2.6921393871307373,
+      "learning_rate": 3.8597640891218876e-05,
+      "loss": 0.804,
+      "step": 1178
+    },
+    {
+      "epoch": 0.0174579795529371,
+      "grad_norm": 2.3015975952148438,
+      "learning_rate": 3.9613368283093056e-05,
+      "loss": 0.7944,
+      "step": 1209
+    },
+    {
+      "epoch": 0.017905620054294462,
+      "grad_norm": 2.8116579055786133,
+      "learning_rate": 4.062909567496724e-05,
+      "loss": 0.7977,
+      "step": 1240
+    },
+    {
+      "epoch": 0.018353260555651823,
+      "grad_norm": 2.5720036029815674,
+      "learning_rate": 4.164482306684142e-05,
+      "loss": 0.7854,
+      "step": 1271
+    },
+    {
+      "epoch": 0.018800901057009184,
+      "grad_norm": 2.0802650451660156,
+      "learning_rate": 4.26605504587156e-05,
+      "loss": 0.7892,
+      "step": 1302
+    },
+    {
+      "epoch": 0.019248541558366545,
+      "grad_norm": 2.4343624114990234,
+      "learning_rate": 4.367627785058978e-05,
+      "loss": 0.7897,
+      "step": 1333
+    },
+    {
+      "epoch": 0.019696182059723906,
+      "grad_norm": 2.509686231613159,
+      "learning_rate": 4.469200524246396e-05,
+      "loss": 0.7855,
+      "step": 1364
+    },
+    {
+      "epoch": 0.020143822561081267,
+      "grad_norm": 2.626512289047241,
+      "learning_rate": 4.570773263433814e-05,
+      "loss": 0.7873,
+      "step": 1395
+    },
+    {
+      "epoch": 0.02059146306243863,
+      "grad_norm": 2.8619399070739746,
+      "learning_rate": 4.672346002621232e-05,
+      "loss": 0.7891,
+      "step": 1426
+    },
+    {
+      "epoch": 0.021039103563795993,
+      "grad_norm": 2.724792718887329,
+      "learning_rate": 4.77391874180865e-05,
+      "loss": 0.782,
+      "step": 1457
+    },
+    {
+      "epoch": 0.021486744065153354,
+      "grad_norm": 2.6659562587738037,
+      "learning_rate": 4.875491480996068e-05,
+      "loss": 0.7856,
+      "step": 1488
+    },
+    {
+      "epoch": 0.021934384566510715,
+      "grad_norm": 2.646078586578369,
+      "learning_rate": 4.977064220183487e-05,
+      "loss": 0.7748,
+      "step": 1519
+    },
+    {
+      "epoch": 0.022382025067868076,
+      "grad_norm": 2.429288387298584,
+      "learning_rate": 4.9999915451558777e-05,
+      "loss": 0.7722,
+      "step": 1550
+    },
+    {
+      "epoch": 0.022829665569225437,
+      "grad_norm": 1.9933409690856934,
+      "learning_rate": 4.999955597496219e-05,
+      "loss": 0.7874,
+      "step": 1581
+    },
+    {
+      "epoch": 0.023277306070582798,
+      "grad_norm": 2.314889907836914,
+      "learning_rate": 4.9998914381774255e-05,
+      "loss": 0.7757,
+      "step": 1612
+    },
+    {
+      "epoch": 0.023724946571940162,
+      "grad_norm": 2.2891199588775635,
+      "learning_rate": 4.999799067923527e-05,
+      "loss": 0.7713,
+      "step": 1643
+    },
+    {
+      "epoch": 0.024172587073297523,
+      "grad_norm": 2.4892444610595703,
+      "learning_rate": 4.999678487776908e-05,
+      "loss": 0.7687,
+      "step": 1674
+    },
+    {
+      "epoch": 0.024620227574654884,
+      "grad_norm": 2.3015685081481934,
+      "learning_rate": 4.9995296990983006e-05,
+      "loss": 0.7721,
+      "step": 1705
+    },
+    {
+      "epoch": 0.025067868076012245,
+      "grad_norm": 2.278954029083252,
+      "learning_rate": 4.999352703566763e-05,
+      "loss": 0.7741,
+      "step": 1736
+    },
+    {
+      "epoch": 0.025515508577369606,
+      "grad_norm": 1.7260370254516602,
+      "learning_rate": 4.999147503179668e-05,
+      "loss": 0.7681,
+      "step": 1767
+    },
+    {
+      "epoch": 0.025963149078726967,
+      "grad_norm": 2.0179309844970703,
+      "learning_rate": 4.998914100252672e-05,
+      "loss": 0.7604,
+      "step": 1798
+    },
+    {
+      "epoch": 0.02641078958008433,
+      "grad_norm": 2.53022837638855,
+      "learning_rate": 4.998652497419696e-05,
+      "loss": 0.7598,
+      "step": 1829
+    },
+    {
+      "epoch": 0.026858430081441693,
+      "grad_norm": 1.859253168106079,
+      "learning_rate": 4.9983626976328927e-05,
+      "loss": 0.7606,
+      "step": 1860
+    },
+    {
+      "epoch": 0.027306070582799054,
+      "grad_norm": 1.759303331375122,
+      "learning_rate": 4.998044704162613e-05,
+      "loss": 0.7532,
+      "step": 1891
+    },
+    {
+      "epoch": 0.027753711084156415,
+      "grad_norm": 2.4389419555664062,
+      "learning_rate": 4.9976985205973705e-05,
+      "loss": 0.7646,
+      "step": 1922
+    },
+    {
+      "epoch": 0.028201351585513776,
+      "grad_norm": 2.155348777770996,
+      "learning_rate": 4.997324150843799e-05,
+      "loss": 0.7569,
+      "step": 1953
+    },
+    {
+      "epoch": 0.028648992086871137,
+      "grad_norm": 2.0138537883758545,
+      "learning_rate": 4.99692159912661e-05,
+      "loss": 0.7677,
+      "step": 1984
+    },
+    {
+      "epoch": 0.029096632588228498,
+      "grad_norm": 2.5275282859802246,
+      "learning_rate": 4.996490869988546e-05,
+      "loss": 0.7519,
+      "step": 2015
+    },
+    {
+      "epoch": 0.02954427308958586,
+      "grad_norm": 1.8147333860397339,
+      "learning_rate": 4.996031968290326e-05,
+      "loss": 0.7509,
+      "step": 2046
+    },
+    {
+      "epoch": 0.029991913590943223,
+      "grad_norm": 2.1941769123077393,
+      "learning_rate": 4.995544899210594e-05,
+      "loss": 0.754,
+      "step": 2077
+    },
+    {
+      "epoch": 0.030439554092300584,
+      "grad_norm": 1.8953059911727905,
+      "learning_rate": 4.9950296682458583e-05,
+      "loss": 0.747,
+      "step": 2108
+    },
+    {
+      "epoch": 0.030887194593657945,
+      "grad_norm": 3.3973031044006348,
+      "learning_rate": 4.994486281210429e-05,
+      "loss": 0.7513,
+      "step": 2139
+    },
+    {
+      "epoch": 0.031334835095015307,
+      "grad_norm": 2.66795015335083,
+      "learning_rate": 4.9939147442363566e-05,
+      "loss": 0.7469,
+      "step": 2170
+    },
+    {
+      "epoch": 0.03178247559637267,
+      "grad_norm": 1.6254230737686157,
+      "learning_rate": 4.9933150637733574e-05,
+      "loss": 0.7297,
+      "step": 2201
+    },
+    {
+      "epoch": 0.03223011609773003,
+      "grad_norm": 1.822745680809021,
+      "learning_rate": 4.992687246588743e-05,
+      "loss": 0.754,
+      "step": 2232
+    },
+    {
+      "epoch": 0.03267775659908739,
+      "grad_norm": 1.6898781061172485,
+      "learning_rate": 4.992031299767347e-05,
+      "loss": 0.7478,
+      "step": 2263
+    },
+    {
+      "epoch": 0.03312539710044475,
+      "grad_norm": 1.799280047416687,
+      "learning_rate": 4.9913472307114386e-05,
+      "loss": 0.746,
+      "step": 2294
+    },
+    {
+      "epoch": 0.033573037601802115,
+      "grad_norm": 2.2501840591430664,
+      "learning_rate": 4.9906350471406446e-05,
+      "loss": 0.7408,
+      "step": 2325
+    },
+    {
+      "epoch": 0.03402067810315947,
+      "grad_norm": 2.3315324783325195,
+      "learning_rate": 4.989894757091861e-05,
+      "loss": 0.7301,
+      "step": 2356
+    },
+    {
+      "epoch": 0.03446831860451684,
+      "grad_norm": 1.5820438861846924,
+      "learning_rate": 4.989126368919158e-05,
+      "loss": 0.7305,
+      "step": 2387
+    },
+    {
+      "epoch": 0.0349159591058742,
+      "grad_norm": 2.5696022510528564,
+      "learning_rate": 4.988329891293693e-05,
+      "loss": 0.7337,
+      "step": 2418
+    },
+    {
+      "epoch": 0.03536359960723156,
+      "grad_norm": 1.8880938291549683,
+      "learning_rate": 4.987505333203608e-05,
+      "loss": 0.7385,
+      "step": 2449
+    },
+    {
+      "epoch": 0.035811240108588924,
+      "grad_norm": 2.6148738861083984,
+      "learning_rate": 4.9866527039539276e-05,
+      "loss": 0.7292,
+      "step": 2480
+    },
+    {
+      "epoch": 0.03625888060994628,
+      "grad_norm": 1.6925290822982788,
+      "learning_rate": 4.9857720131664594e-05,
+      "loss": 0.7344,
+      "step": 2511
+    },
+    {
+      "epoch": 0.036706521111303646,
+      "grad_norm": 1.7675210237503052,
+      "learning_rate": 4.9848632707796773e-05,
+      "loss": 0.7354,
+      "step": 2542
+    },
+    {
+      "epoch": 0.037154161612661,
+      "grad_norm": 2.1053173542022705,
+      "learning_rate": 4.9839264870486155e-05,
+      "loss": 0.7272,
+      "step": 2573
+    },
+    {
+      "epoch": 0.03760180211401837,
+      "grad_norm": 1.9718347787857056,
+      "learning_rate": 4.9829616725447526e-05,
+      "loss": 0.7336,
+      "step": 2604
+    },
+    {
+      "epoch": 0.03804944261537573,
+      "grad_norm": 1.5777671337127686,
+      "learning_rate": 4.981968838155888e-05,
+      "loss": 0.7182,
+      "step": 2635
+    },
+    {
+      "epoch": 0.03849708311673309,
+      "grad_norm": 1.905127763748169,
+      "learning_rate": 4.980947995086024e-05,
+      "loss": 0.7296,
+      "step": 2666
+    },
+    {
+      "epoch": 0.038944723618090454,
+      "grad_norm": 1.63962721824646,
+      "learning_rate": 4.979899154855234e-05,
+      "loss": 0.7249,
+      "step": 2697
+    },
+    {
+      "epoch": 0.03939236411944781,
+      "grad_norm": 1.584331750869751,
+      "learning_rate": 4.9788223292995386e-05,
+      "loss": 0.7345,
+      "step": 2728
+    },
+    {
+      "epoch": 0.039840004620805176,
+      "grad_norm": 1.9111014604568481,
+      "learning_rate": 4.977717530570768e-05,
+      "loss": 0.7225,
+      "step": 2759
+    },
+    {
+      "epoch": 0.040287645122162534,
+      "grad_norm": 2.3216073513031006,
+      "learning_rate": 4.976584771136425e-05,
+      "loss": 0.7207,
+      "step": 2790
+    },
+    {
+      "epoch": 0.0407352856235199,
+      "grad_norm": 1.6002410650253296,
+      "learning_rate": 4.975424063779547e-05,
+      "loss": 0.7228,
+      "step": 2821
+    },
+    {
+      "epoch": 0.04118292612487726,
+      "grad_norm": 2.104731798171997,
+      "learning_rate": 4.974235421598557e-05,
+      "loss": 0.7127,
+      "step": 2852
+    },
+    {
+      "epoch": 0.04163056662623462,
+      "grad_norm": 1.7114660739898682,
+      "learning_rate": 4.973018858007122e-05,
+      "loss": 0.7283,
+      "step": 2883
+    },
+    {
+      "epoch": 0.042078207127591985,
+      "grad_norm": 1.948133945465088,
+      "learning_rate": 4.9717743867339963e-05,
+      "loss": 0.7209,
+      "step": 2914
+    },
+    {
+      "epoch": 0.04252584762894934,
+      "grad_norm": 1.621764898300171,
+      "learning_rate": 4.9705020218228695e-05,
+      "loss": 0.7272,
+      "step": 2945
+    },
+    {
+      "epoch": 0.04297348813030671,
+      "grad_norm": 1.6967558860778809,
+      "learning_rate": 4.969201777632205e-05,
+      "loss": 0.7191,
+      "step": 2976
+    },
+    {
+      "epoch": 0.043421128631664065,
+      "grad_norm": 1.6656996011734009,
+      "learning_rate": 4.9678736688350846e-05,
+      "loss": 0.7205,
+      "step": 3007
+    },
+    {
+      "epoch": 0.04386876913302143,
+      "grad_norm": 2.151475191116333,
+      "learning_rate": 4.966517710419033e-05,
+      "loss": 0.7168,
+      "step": 3038
+    },
+    {
+      "epoch": 0.044316409634378794,
+      "grad_norm": 2.213109016418457,
+      "learning_rate": 4.965133917685858e-05,
+      "loss": 0.7139,
+      "step": 3069
+    },
+    {
+      "epoch": 0.04476405013573615,
+      "grad_norm": 1.5380377769470215,
+      "learning_rate": 4.9637223062514714e-05,
+      "loss": 0.7237,
+      "step": 3100
+    },
+    {
+      "epoch": 0.045211690637093516,
+      "grad_norm": 2.312377452850342,
+      "learning_rate": 4.962282892045718e-05,
+      "loss": 0.7156,
+      "step": 3131
+    },
+    {
+      "epoch": 0.04565933113845087,
+      "grad_norm": 1.7220717668533325,
+      "learning_rate": 4.9608156913121904e-05,
+      "loss": 0.7122,
+      "step": 3162
+    },
+    {
+      "epoch": 0.04610697163980824,
+      "grad_norm": 1.802856206893921,
+      "learning_rate": 4.959320720608049e-05,
+      "loss": 0.7128,
+      "step": 3193
+    },
+    {
+      "epoch": 0.046554612141165595,
+      "grad_norm": 1.6629964113235474,
+      "learning_rate": 4.9577979968038354e-05,
+      "loss": 0.7172,
+      "step": 3224
+    },
+    {
+      "epoch": 0.04700225264252296,
+      "grad_norm": 3.440115213394165,
+      "learning_rate": 4.956247537083282e-05,
+      "loss": 0.7213,
+      "step": 3255
+    },
+    {
+      "epoch": 0.047449893143880324,
+      "grad_norm": 1.5721139907836914,
+      "learning_rate": 4.9546693589431145e-05,
+      "loss": 0.7148,
+      "step": 3286
+    },
+    {
+      "epoch": 0.04789753364523768,
+      "grad_norm": 2.0920398235321045,
+      "learning_rate": 4.9530634801928595e-05,
+      "loss": 0.7145,
+      "step": 3317
+    },
+    {
+      "epoch": 0.048345174146595046,
+      "grad_norm": 1.666566014289856,
+      "learning_rate": 4.9514299189546395e-05,
+      "loss": 0.7095,
+      "step": 3348
+    },
+    {
+      "epoch": 0.048792814647952404,
+      "grad_norm": 1.8222129344940186,
+      "learning_rate": 4.949768693662973e-05,
+      "loss": 0.7138,
+      "step": 3379
+    },
+    {
+      "epoch": 0.04924045514930977,
+      "grad_norm": 1.7302964925765991,
+      "learning_rate": 4.948079823064559e-05,
+      "loss": 0.7017,
+      "step": 3410
+    },
+    {
+      "epoch": 0.049688095650667126,
+      "grad_norm": 1.7338463068008423,
+      "learning_rate": 4.946363326218074e-05,
+      "loss": 0.6979,
+      "step": 3441
+    },
+    {
+      "epoch": 0.05013573615202449,
+      "grad_norm": 1.5637450218200684,
+      "learning_rate": 4.9446192224939525e-05,
+      "loss": 0.7011,
+      "step": 3472
+    },
+    {
+      "epoch": 0.050583376653381855,
+      "grad_norm": 1.5632222890853882,
+      "learning_rate": 4.942847531574167e-05,
+      "loss": 0.704,
+      "step": 3503
+    },
+    {
+      "epoch": 0.05103101715473921,
+      "grad_norm": 1.588402509689331,
+      "learning_rate": 4.941048273452008e-05,
+      "loss": 0.7011,
+      "step": 3534
+    },
+    {
+      "epoch": 0.05147865765609658,
+      "grad_norm": 1.8840582370758057,
+      "learning_rate": 4.9392214684318605e-05,
+      "loss": 0.7016,
+      "step": 3565
+    },
+    {
+      "epoch": 0.051926298157453935,
+      "grad_norm": 1.2702268362045288,
+      "learning_rate": 4.93736713712897e-05,
+      "loss": 0.7004,
+      "step": 3596
+    },
+    {
+      "epoch": 0.0523739386588113,
+      "grad_norm": 1.3812692165374756,
+      "learning_rate": 4.9354853004692124e-05,
+      "loss": 0.7046,
+      "step": 3627
+    },
+    {
+      "epoch": 0.05282157916016866,
+      "grad_norm": 1.7257345914840698,
+      "learning_rate": 4.93357597968886e-05,
+      "loss": 0.6976,
+      "step": 3658
+    },
+    {
+      "epoch": 0.05326921966152602,
+      "grad_norm": 1.7458925247192383,
+      "learning_rate": 4.931639196334338e-05,
+      "loss": 0.6997,
+      "step": 3689
+    },
+    {
+      "epoch": 0.053716860162883386,
+      "grad_norm": 2.1996099948883057,
+      "learning_rate": 4.9296749722619826e-05,
+      "loss": 0.6991,
+      "step": 3720
+    },
+    {
+      "epoch": 0.05416450066424074,
+      "grad_norm": 1.6615021228790283,
+      "learning_rate": 4.9276833296377966e-05,
+      "loss": 0.7005,
+      "step": 3751
+    },
+    {
+      "epoch": 0.05461214116559811,
+      "grad_norm": 1.6276952028274536,
+      "learning_rate": 4.925664290937196e-05,
+      "loss": 0.7097,
+      "step": 3782
+    },
+    {
+      "epoch": 0.055059781666955465,
+      "grad_norm": 1.758227825164795,
+      "learning_rate": 4.9236178789447576e-05,
+      "loss": 0.6955,
+      "step": 3813
+    },
+    {
+      "epoch": 0.05550742216831283,
+      "grad_norm": 1.195280909538269,
+      "learning_rate": 4.921544116753962e-05,
+      "loss": 0.7073,
+      "step": 3844
+    },
+    {
+      "epoch": 0.05595506266967019,
+      "grad_norm": 1.6281015872955322,
+      "learning_rate": 4.919443027766935e-05,
+      "loss": 0.7022,
+      "step": 3875
+    },
+    {
+      "epoch": 0.05640270317102755,
+      "grad_norm": 1.3543150424957275,
+      "learning_rate": 4.91731463569418e-05,
+      "loss": 0.7036,
+      "step": 3906
+    },
+    {
+      "epoch": 0.056850343672384916,
+      "grad_norm": 2.16947078704834,
+      "learning_rate": 4.915158964554312e-05,
+      "loss": 0.7007,
+      "step": 3937
+    },
+    {
+      "epoch": 0.057297984173742274,
+      "grad_norm": 1.324578881263733,
+      "learning_rate": 4.912976038673786e-05,
+      "loss": 0.6941,
+      "step": 3968
+    },
+    {
+      "epoch": 0.05774562467509964,
+      "grad_norm": 1.9811108112335205,
+      "learning_rate": 4.9107658826866254e-05,
+      "loss": 0.6908,
+      "step": 3999
+    },
+    {
+      "epoch": 0.058193265176456996,
+      "grad_norm": 1.2975554466247559,
+      "learning_rate": 4.908528521534139e-05,
+      "loss": 0.6936,
+      "step": 4030
+    },
+    {
+      "epoch": 0.05864090567781436,
+      "grad_norm": 1.583282232284546,
+      "learning_rate": 4.906263980464644e-05,
+      "loss": 0.698,
+      "step": 4061
+    },
+    {
+      "epoch": 0.05908854617917172,
+      "grad_norm": 1.3532944917678833,
+      "learning_rate": 4.903972285033178e-05,
+      "loss": 0.7049,
+      "step": 4092
+    },
+    {
+      "epoch": 0.05953618668052908,
+      "grad_norm": 2.1245481967926025,
+      "learning_rate": 4.901653461101213e-05,
+      "loss": 0.7016,
+      "step": 4123
+    },
+    {
+      "epoch": 0.05998382718188645,
+      "grad_norm": 1.6913797855377197,
+      "learning_rate": 4.8993075348363626e-05,
+      "loss": 0.6981,
+      "step": 4154
+    },
+    {
+      "epoch": 0.060431467683243804,
+      "grad_norm": 1.51249098777771,
+      "learning_rate": 4.896934532712084e-05,
+      "loss": 0.6955,
+      "step": 4185
+    },
+    {
+      "epoch": 0.06087910818460117,
+      "grad_norm": 1.3880395889282227,
+      "learning_rate": 4.8945344815073846e-05,
+      "loss": 0.6934,
+      "step": 4216
+    },
+    {
+      "epoch": 0.061326748685958526,
+      "grad_norm": 1.6354159116744995,
+      "learning_rate": 4.892107408306516e-05,
+      "loss": 0.6938,
+      "step": 4247
+    },
+    {
+      "epoch": 0.06177438918731589,
+      "grad_norm": 2.126742362976074,
+      "learning_rate": 4.889653340498669e-05,
+      "loss": 0.7003,
+      "step": 4278
+    },
+    {
+      "epoch": 0.06222202968867325,
+      "grad_norm": 1.7903707027435303,
+      "learning_rate": 4.8871723057776664e-05,
+      "loss": 0.6885,
+      "step": 4309
+    },
+    {
+      "epoch": 0.06266967019003061,
+      "grad_norm": 1.537806510925293,
+      "learning_rate": 4.8846643321416476e-05,
+      "loss": 0.6892,
+      "step": 4340
+    },
+    {
+      "epoch": 0.06311731069138797,
+      "grad_norm": 1.6445434093475342,
+      "learning_rate": 4.882129447892753e-05,
+      "loss": 0.6843,
+      "step": 4371
+    },
+    {
+      "epoch": 0.06356495119274534,
+      "grad_norm": 1.555373191833496,
+      "learning_rate": 4.8795676816368076e-05,
+      "loss": 0.6899,
+      "step": 4402
+    },
+    {
+      "epoch": 0.0640125916941027,
+      "grad_norm": 1.8370277881622314,
+      "learning_rate": 4.876979062282995e-05,
+      "loss": 0.6813,
+      "step": 4433
+    },
+    {
+      "epoch": 0.06446023219546006,
+      "grad_norm": 1.3132514953613281,
+      "learning_rate": 4.8743636190435325e-05,
+      "loss": 0.6832,
+      "step": 4464
+    },
+    {
+      "epoch": 0.06490787269681741,
+      "grad_norm": 1.3186298608779907,
+      "learning_rate": 4.871721381433344e-05,
+      "loss": 0.6879,
+      "step": 4495
+    },
+    {
+      "epoch": 0.06535551319817479,
+      "grad_norm": 1.4360268115997314,
+      "learning_rate": 4.869052379269719e-05,
+      "loss": 0.69,
+      "step": 4526
+    },
+    {
+      "epoch": 0.06580315369953214,
+      "grad_norm": 1.670765995979309,
+      "learning_rate": 4.866356642671985e-05,
+      "loss": 0.6865,
+      "step": 4557
+    },
+    {
+      "epoch": 0.0662507942008895,
+      "grad_norm": 1.7548723220825195,
+      "learning_rate": 4.8636342020611634e-05,
+      "loss": 0.6852,
+      "step": 4588
+    },
+    {
+      "epoch": 0.06669843470224687,
+      "grad_norm": 1.5086426734924316,
+      "learning_rate": 4.860885088159626e-05,
+      "loss": 0.6894,
+      "step": 4619
+    },
+    {
+      "epoch": 0.06714607520360423,
+      "grad_norm": 1.3140665292739868,
+      "learning_rate": 4.858109331990751e-05,
+      "loss": 0.6812,
+      "step": 4650
+    },
+    {
+      "epoch": 0.06759371570496159,
+      "grad_norm": 1.4212454557418823,
+      "learning_rate": 4.855306964878567e-05,
+      "loss": 0.6872,
+      "step": 4681
+    },
+    {
+      "epoch": 0.06804135620631895,
+      "grad_norm": 1.3034414052963257,
+      "learning_rate": 4.8524780184474084e-05,
+      "loss": 0.6901,
+      "step": 4712
+    },
+    {
+      "epoch": 0.06848899670767632,
+      "grad_norm": 1.3741438388824463,
+      "learning_rate": 4.8496225246215496e-05,
+      "loss": 0.6875,
+      "step": 4743
+    },
+    {
+      "epoch": 0.06893663720903367,
+      "grad_norm": 1.7262542247772217,
+      "learning_rate": 4.8467405156248505e-05,
+      "loss": 0.6868,
+      "step": 4774
+    },
+    {
+      "epoch": 0.06938427771039103,
+      "grad_norm": 1.3293650150299072,
+      "learning_rate": 4.843832023980392e-05,
+      "loss": 0.6891,
+      "step": 4805
+    },
+    {
+      "epoch": 0.0698319182117484,
+      "grad_norm": 1.3448151350021362,
+      "learning_rate": 4.840897082510106e-05,
+      "loss": 0.6765,
+      "step": 4836
+    },
+    {
+      "epoch": 0.07027955871310576,
+      "grad_norm": 2.961280584335327,
+      "learning_rate": 4.8379357243344084e-05,
+      "loss": 0.6939,
+      "step": 4867
+    },
+    {
+      "epoch": 0.07072719921446312,
+      "grad_norm": 1.8265361785888672,
+      "learning_rate": 4.8349479828718236e-05,
+      "loss": 0.677,
+      "step": 4898
+    },
+    {
+      "epoch": 0.07117483971582048,
+      "grad_norm": 1.490349531173706,
+      "learning_rate": 4.8319338918386075e-05,
+      "loss": 0.6778,
+      "step": 4929
+    },
+    {
+      "epoch": 0.07162248021717785,
+      "grad_norm": 1.3669307231903076,
+      "learning_rate": 4.828893485248369e-05,
+      "loss": 0.6746,
+      "step": 4960
+    },
+    {
+      "epoch": 0.0720701207185352,
+      "grad_norm": 1.3995884656906128,
+      "learning_rate": 4.825826797411682e-05,
+      "loss": 0.6757,
+      "step": 4991
+    },
+    {
+      "epoch": 0.07251776121989256,
+      "grad_norm": 1.1217372417449951,
+      "learning_rate": 4.822733862935702e-05,
+      "loss": 0.6832,
+      "step": 5022
+    },
+    {
+      "epoch": 0.07296540172124993,
+      "grad_norm": 1.2192097902297974,
+      "learning_rate": 4.819614716723775e-05,
+      "loss": 0.6868,
+      "step": 5053
+    },
+    {
+      "epoch": 0.07341304222260729,
+      "grad_norm": 1.5045067071914673,
+      "learning_rate": 4.8164693939750425e-05,
+      "loss": 0.6793,
+      "step": 5084
+    },
+    {
+      "epoch": 0.07386068272396465,
+      "grad_norm": 1.7127234935760498,
+      "learning_rate": 4.813297930184042e-05,
+      "loss": 0.6797,
+      "step": 5115
+    },
+    {
+      "epoch": 0.074308323225322,
+      "grad_norm": 1.846561312675476,
+      "learning_rate": 4.810100361140314e-05,
+      "loss": 0.6767,
+      "step": 5146
+    },
+    {
+      "epoch": 0.07475596372667938,
+      "grad_norm": 1.3076797723770142,
+      "learning_rate": 4.8068767229279885e-05,
+      "loss": 0.6855,
+      "step": 5177
+    },
+    {
+      "epoch": 0.07520360422803674,
+      "grad_norm": 1.4170383214950562,
+      "learning_rate": 4.8036270519253854e-05,
+      "loss": 0.681,
+      "step": 5208
+    },
+    {
+      "epoch": 0.0756512447293941,
+      "grad_norm": 1.2504942417144775,
+      "learning_rate": 4.8003513848046e-05,
+      "loss": 0.6778,
+      "step": 5239
+    },
+    {
+      "epoch": 0.07609888523075146,
+      "grad_norm": 1.1522283554077148,
+      "learning_rate": 4.79704975853109e-05,
+      "loss": 0.6749,
+      "step": 5270
+    },
+    {
+      "epoch": 0.07654652573210882,
+      "grad_norm": 1.6351525783538818,
+      "learning_rate": 4.793722210363262e-05,
+      "loss": 0.6745,
+      "step": 5301
+    },
+    {
+      "epoch": 0.07699416623346618,
+      "grad_norm": 1.5093014240264893,
+      "learning_rate": 4.7903687778520414e-05,
+      "loss": 0.6747,
+      "step": 5332
+    },
+    {
+      "epoch": 0.07744180673482354,
+      "grad_norm": 1.362160563468933,
+      "learning_rate": 4.7869894988404593e-05,
+      "loss": 0.673,
+      "step": 5363
+    },
+    {
+      "epoch": 0.07788944723618091,
+      "grad_norm": 1.2021727561950684,
+      "learning_rate": 4.783584411463221e-05,
+      "loss": 0.6768,
+      "step": 5394
+    },
+    {
+      "epoch": 0.07833708773753827,
+      "grad_norm": 2.1543540954589844,
+      "learning_rate": 4.780153554146274e-05,
+      "loss": 0.672,
+      "step": 5425
+    },
+    {
+      "epoch": 0.07878472823889562,
+      "grad_norm": 1.882712721824646,
+      "learning_rate": 4.7766969656063766e-05,
+      "loss": 0.6926,
+      "step": 5456
+    },
+    {
+      "epoch": 0.079232368740253,
+      "grad_norm": 1.3975650072097778,
+      "learning_rate": 4.773214684850662e-05,
+      "loss": 0.6747,
+      "step": 5487
+    },
+    {
+      "epoch": 0.07968000924161035,
+      "grad_norm": 1.3912913799285889,
+      "learning_rate": 4.769706751176193e-05,
+      "loss": 0.6756,
+      "step": 5518
+    },
+    {
+      "epoch": 0.08012764974296771,
+      "grad_norm": 1.7227635383605957,
+      "learning_rate": 4.7661732041695264e-05,
+      "loss": 0.6694,
+      "step": 5549
+    },
+    {
+      "epoch": 0.08057529024432507,
+      "grad_norm": 1.3151129484176636,
+      "learning_rate": 4.762614083706258e-05,
+      "loss": 0.6715,
+      "step": 5580
+    },
+    {
+      "epoch": 0.08102293074568244,
+      "grad_norm": 1.0972425937652588,
+      "learning_rate": 4.759029429950581e-05,
+      "loss": 0.6661,
+      "step": 5611
+    },
+    {
+      "epoch": 0.0814705712470398,
+      "grad_norm": 1.2346575260162354,
+      "learning_rate": 4.7554192833548235e-05,
+      "loss": 0.66,
+      "step": 5642
+    },
+    {
+      "epoch": 0.08191821174839715,
+      "grad_norm": 1.4536516666412354,
+      "learning_rate": 4.751783684659e-05,
+      "loss": 0.6743,
+      "step": 5673
+    },
+    {
+      "epoch": 0.08236585224975453,
+      "grad_norm": 1.1361631155014038,
+      "learning_rate": 4.748122674890348e-05,
+      "loss": 0.6791,
+      "step": 5704
+    },
+    {
+      "epoch": 0.08281349275111188,
+      "grad_norm": 1.2605111598968506,
+      "learning_rate": 4.7444362953628654e-05,
+      "loss": 0.6797,
+      "step": 5735
+    },
+    {
+      "epoch": 0.08326113325246924,
+      "grad_norm": 1.2355903387069702,
+      "learning_rate": 4.7407245876768424e-05,
+      "loss": 0.6642,
+      "step": 5766
+    },
+    {
+      "epoch": 0.0837087737538266,
+      "grad_norm": 1.6677048206329346,
+      "learning_rate": 4.736987593718397e-05,
+      "loss": 0.6759,
+      "step": 5797
+    },
+    {
+      "epoch": 0.08415641425518397,
+      "grad_norm": 1.4781981706619263,
+      "learning_rate": 4.733225355658999e-05,
+      "loss": 0.6707,
+      "step": 5828
+    },
+    {
+      "epoch": 0.08460405475654133,
+      "grad_norm": 1.138583779335022,
+      "learning_rate": 4.7294379159549926e-05,
+      "loss": 0.6636,
+      "step": 5859
+    },
+    {
+      "epoch": 0.08505169525789869,
+      "grad_norm": 1.529036283493042,
+      "learning_rate": 4.725625317347119e-05,
+      "loss": 0.6705,
+      "step": 5890
+    },
+    {
+      "epoch": 0.08549933575925606,
+      "grad_norm": 1.3216760158538818,
+      "learning_rate": 4.7217876028600374e-05,
+      "loss": 0.6714,
+      "step": 5921
+    },
+    {
+      "epoch": 0.08594697626061341,
+      "grad_norm": 1.1820168495178223,
+      "learning_rate": 4.717924815801832e-05,
+      "loss": 0.6757,
+      "step": 5952
+    },
+    {
+      "epoch": 0.08639461676197077,
+      "grad_norm": 1.393571138381958,
+      "learning_rate": 4.714036999763532e-05,
+      "loss": 0.6672,
+      "step": 5983
+    },
+    {
+      "epoch": 0.08684225726332813,
+      "grad_norm": 1.4574682712554932,
+      "learning_rate": 4.7101241986186116e-05,
+      "loss": 0.6655,
+      "step": 6014
+    },
+    {
+      "epoch": 0.0872898977646855,
+      "grad_norm": 1.138645887374878,
+      "learning_rate": 4.7061864565225e-05,
+      "loss": 0.6663,
+      "step": 6045
+    },
+    {
+      "epoch": 0.08773753826604286,
+      "grad_norm": 1.7602777481079102,
+      "learning_rate": 4.702223817912081e-05,
+      "loss": 0.6695,
+      "step": 6076
+    },
+    {
+      "epoch": 0.08818517876740022,
+      "grad_norm": 1.2323459386825562,
+      "learning_rate": 4.698236327505195e-05,
+      "loss": 0.6636,
+      "step": 6107
+    },
+    {
+      "epoch": 0.08863281926875759,
+      "grad_norm": 1.6881431341171265,
+      "learning_rate": 4.694224030300127e-05,
+      "loss": 0.6653,
+      "step": 6138
+    },
+    {
+      "epoch": 0.08908045977011494,
+      "grad_norm": 1.391417384147644,
+      "learning_rate": 4.690186971575107e-05,
+      "loss": 0.6636,
+      "step": 6169
+    },
+    {
+      "epoch": 0.0895281002714723,
+      "grad_norm": 1.3066257238388062,
+      "learning_rate": 4.6861251968877916e-05,
+      "loss": 0.6777,
+      "step": 6200
+    },
+    {
+      "epoch": 0.08997574077282966,
+      "grad_norm": 1.2001326084136963,
+      "learning_rate": 4.68203875207476e-05,
+      "loss": 0.6683,
+      "step": 6231
+    },
+    {
+      "epoch": 0.09042338127418703,
+      "grad_norm": 1.4361172914505005,
+      "learning_rate": 4.677927683250983e-05,
+      "loss": 0.6634,
+      "step": 6262
+    },
+    {
+      "epoch": 0.09087102177554439,
+      "grad_norm": 8.04520320892334,
+      "learning_rate": 4.6737920368093156e-05,
+      "loss": 0.6752,
+      "step": 6293
+    },
+    {
+      "epoch": 0.09131866227690175,
+      "grad_norm": 1.4874210357666016,
+      "learning_rate": 4.669631859419965e-05,
+      "loss": 0.6733,
+      "step": 6324
+    },
+    {
+      "epoch": 0.09176630277825912,
+      "grad_norm": 1.234491229057312,
+      "learning_rate": 4.6654471980299676e-05,
+      "loss": 0.668,
+      "step": 6355
+    },
+    {
+      "epoch": 0.09221394327961648,
+      "grad_norm": 1.2088687419891357,
+      "learning_rate": 4.661238099862658e-05,
+      "loss": 0.6705,
+      "step": 6386
+    },
+    {
+      "epoch": 0.09266158378097383,
+      "grad_norm": 1.1937814950942993,
+      "learning_rate": 4.657004612417138e-05,
+      "loss": 0.6853,
+      "step": 6417
+    },
+    {
+      "epoch": 0.09310922428233119,
+      "grad_norm": 1.5205374956130981,
+      "learning_rate": 4.6527467834677374e-05,
+      "loss": 0.685,
+      "step": 6448
+    },
+    {
+      "epoch": 0.09355686478368856,
+      "grad_norm": 1.2221660614013672,
+      "learning_rate": 4.648464661063478e-05,
+      "loss": 0.6622,
+      "step": 6479
+    },
+    {
+      "epoch": 0.09400450528504592,
+      "grad_norm": 1.0762608051300049,
+      "learning_rate": 4.6441582935275264e-05,
+      "loss": 0.669,
+      "step": 6510
+    },
+    {
+      "epoch": 0.09445214578640328,
+      "grad_norm": 1.4416946172714233,
+      "learning_rate": 4.6398277294566586e-05,
+      "loss": 0.6674,
+      "step": 6541
+    },
+    {
+      "epoch": 0.09489978628776065,
+      "grad_norm": 1.559158205986023,
+      "learning_rate": 4.6354730177207e-05,
+      "loss": 0.6681,
+      "step": 6572
+    },
+    {
+      "epoch": 0.095347426789118,
+      "grad_norm": 1.3833891153335571,
+      "learning_rate": 4.6310942074619787e-05,
+      "loss": 0.6681,
+      "step": 6603
+    },
+    {
+      "epoch": 0.09579506729047536,
+      "grad_norm": 1.6753300428390503,
+      "learning_rate": 4.626691348094777e-05,
+      "loss": 0.6658,
+      "step": 6634
+    },
+    {
+      "epoch": 0.09624270779183272,
+      "grad_norm": 1.951198697090149,
+      "learning_rate": 4.622264489304762e-05,
+      "loss": 0.6654,
+      "step": 6665
+    },
+    {
+      "epoch": 0.09669034829319009,
+      "grad_norm": 1.2356919050216675,
+      "learning_rate": 4.617813681048434e-05,
+      "loss": 0.6651,
+      "step": 6696
+    },
+    {
+      "epoch": 0.09713798879454745,
+      "grad_norm": 1.2712593078613281,
+      "learning_rate": 4.61333897355256e-05,
+      "loss": 0.6646,
+      "step": 6727
+    },
+    {
+      "epoch": 0.09758562929590481,
+      "grad_norm": 1.1935900449752808,
+      "learning_rate": 4.608840417313604e-05,
+      "loss": 0.674,
+      "step": 6758
+    },
+    {
+      "epoch": 0.09803326979726218,
+      "grad_norm": 1.1649430990219116,
+      "learning_rate": 4.6043180630971646e-05,
+      "loss": 0.6644,
+      "step": 6789
+    },
+    {
+      "epoch": 0.09848091029861954,
+      "grad_norm": 1.4281456470489502,
+      "learning_rate": 4.599771961937391e-05,
+      "loss": 0.6673,
+      "step": 6820
+    },
+    {
+      "epoch": 0.0989285507999769,
+      "grad_norm": 1.3064521551132202,
+      "learning_rate": 4.5952021651364204e-05,
+      "loss": 0.6584,
+      "step": 6851
+    },
+    {
+      "epoch": 0.09937619130133425,
+      "grad_norm": 1.2546554803848267,
+      "learning_rate": 4.590608724263786e-05,
+      "loss": 0.6612,
+      "step": 6882
+    },
+    {
+      "epoch": 0.09982383180269162,
+      "grad_norm": 1.1866974830627441,
+      "learning_rate": 4.585991691155845e-05,
+      "loss": 0.6612,
+      "step": 6913
+    },
+    {
+      "epoch": 0.10027147230404898,
+      "grad_norm": 1.6166640520095825,
+      "learning_rate": 4.581351117915188e-05,
+      "loss": 0.6551,
+      "step": 6944
+    },
+    {
+      "epoch": 0.10071911280540634,
+      "grad_norm": 1.5471700429916382,
+      "learning_rate": 4.5766870569100534e-05,
+      "loss": 0.6607,
+      "step": 6975
+    },
+    {
+      "epoch": 0.10116675330676371,
+      "grad_norm": 1.3361026048660278,
+      "learning_rate": 4.571999560773736e-05,
+      "loss": 0.666,
+      "step": 7006
+    },
+    {
+      "epoch": 0.10161439380812107,
+      "grad_norm": 1.2938140630722046,
+      "learning_rate": 4.5672886824039915e-05,
+      "loss": 0.6547,
+      "step": 7037
+    },
+    {
+      "epoch": 0.10206203430947842,
+      "grad_norm": 1.2688400745391846,
+      "learning_rate": 4.5625544749624435e-05,
+      "loss": 0.6624,
+      "step": 7068
+    },
+    {
+      "epoch": 0.10250967481083578,
+      "grad_norm": 1.6306285858154297,
+      "learning_rate": 4.5577969918739794e-05,
+      "loss": 0.6627,
+      "step": 7099
+    },
+    {
+      "epoch": 0.10295731531219315,
+      "grad_norm": 1.3346176147460938,
+      "learning_rate": 4.5530162868261486e-05,
+      "loss": 0.6577,
+      "step": 7130
+    },
+    {
+      "epoch": 0.10340495581355051,
+      "grad_norm": 1.0933984518051147,
+      "learning_rate": 4.548212413768558e-05,
+      "loss": 0.6602,
+      "step": 7161
+    },
+    {
+      "epoch": 0.10385259631490787,
+      "grad_norm": 1.575859785079956,
+      "learning_rate": 4.543385426912261e-05,
+      "loss": 0.6593,
+      "step": 7192
+    },
+    {
+      "epoch": 0.10430023681626524,
+      "grad_norm": 1.4265861511230469,
+      "learning_rate": 4.53853538072915e-05,
+      "loss": 0.6564,
+      "step": 7223
+    },
+    {
+      "epoch": 0.1047478773176226,
+      "grad_norm": 1.737012267112732,
+      "learning_rate": 4.533662329951336e-05,
+      "loss": 0.6593,
+      "step": 7254
+    },
+    {
+      "epoch": 0.10519551781897996,
+      "grad_norm": 1.0257115364074707,
+      "learning_rate": 4.528766329570536e-05,
+      "loss": 0.6514,
+      "step": 7285
+    },
+    {
+      "epoch": 0.10564315832033731,
+      "grad_norm": 1.5043773651123047,
+      "learning_rate": 4.523847434837447e-05,
+      "loss": 0.6635,
+      "step": 7316
+    },
+    {
+      "epoch": 0.10609079882169468,
+      "grad_norm": 1.5642234086990356,
+      "learning_rate": 4.518905701261128e-05,
+      "loss": 0.6558,
+      "step": 7347
+    },
+    {
+      "epoch": 0.10653843932305204,
+      "grad_norm": 1.1821067333221436,
+      "learning_rate": 4.5139411846083715e-05,
+      "loss": 0.6686,
+      "step": 7378
+    },
+    {
+      "epoch": 0.1069860798244094,
+      "grad_norm": 1.5492759943008423,
+      "learning_rate": 4.508953940903073e-05,
+      "loss": 0.6543,
+      "step": 7409
+    },
+    {
+      "epoch": 0.10743372032576677,
+      "grad_norm": 1.281914234161377,
+      "learning_rate": 4.5039440264255994e-05,
+      "loss": 0.6516,
+      "step": 7440
+    },
+    {
+      "epoch": 0.10788136082712413,
+      "grad_norm": 1.3318305015563965,
+      "learning_rate": 4.498911497712155e-05,
+      "loss": 0.656,
+      "step": 7471
+    },
+    {
+      "epoch": 0.10832900132848149,
+      "grad_norm": 1.3832449913024902,
+      "learning_rate": 4.493856411554142e-05,
+      "loss": 0.6475,
+      "step": 7502
+    },
+    {
+      "epoch": 0.10877664182983884,
+      "grad_norm": 1.3547158241271973,
+      "learning_rate": 4.4887788249975206e-05,
+      "loss": 0.6594,
+      "step": 7533
+    },
+    {
+      "epoch": 0.10922428233119622,
+      "grad_norm": 1.4633681774139404,
+      "learning_rate": 4.4836787953421656e-05,
+      "loss": 0.6707,
+      "step": 7564
+    },
+    {
+      "epoch": 0.10967192283255357,
+      "grad_norm": 1.1781059503555298,
+      "learning_rate": 4.478556380141218e-05,
+      "loss": 0.6626,
+      "step": 7595
+    },
+    {
+      "epoch": 0.11011956333391093,
+      "grad_norm": 1.4727883338928223,
+      "learning_rate": 4.4734116372004375e-05,
+      "loss": 0.6535,
+      "step": 7626
+    }
+  ],
+  "logging_steps": 31,
+  "max_steps": 30517,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 7630,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.629140197116477e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-7630/training_args.bin b/checkpoint-7630/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542
--- /dev/null
+++ b/checkpoint-7630/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3
+size 5432
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a8b10d032ea5b0e15388eb607d24979f38775a09
--- /dev/null
+++ b/config.json
@@ -0,0 +1,36 @@
+{
+  "_name_or_path": "meta-llama/Llama-3.1-8B",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": 128001,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.0.dev0",
+  "use_cache": true,
+  "vocab_size": 128256
+}
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..73099b4966679656fb84b344ea08b348a6a6a507
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": 128001,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.47.0.dev0"
+}
diff --git a/model-00001-of-00007.safetensors b/model-00001-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..542f9d7381f168eb0b0a63a75a9adf93a5deee06
--- /dev/null
+++ b/model-00001-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13878f97ef55d85d9b352c717dba406c909afe1bae3e88a6a4424a428c0bccc6
+size 4886466168
diff --git a/model-00002-of-00007.safetensors b/model-00002-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1f266ba474079d34baf9b2a204bc3cecae676961
--- /dev/null
+++ b/model-00002-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0901d2e156c6ec7a375c3e90ae5183970af0445d3dc0e53c031b45673775dd64
+size 4832007448
diff --git a/model-00003-of-00007.safetensors b/model-00003-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5501e045b181a6449b2b0997f4ca9f8ce68156ff
--- /dev/null
+++ b/model-00003-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afa0e22bcade9527b5aeb86066e139cc30979f8492c753ff3277e6189863ab97
+size 4999813112
diff --git a/model-00004-of-00007.safetensors b/model-00004-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0ea5edb447c5d0bf0491276907abf457c5b0f94a
--- /dev/null
+++ b/model-00004-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc4f5352f30ea036b0a4e24ba9b029dd94a2b30a716c81acc1c4a5e7bb47a042
+size 4999813128
diff --git a/model-00005-of-00007.safetensors b/model-00005-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d308635d86f98645d1b129703f4ba4ecc6a84a89
--- /dev/null
+++ b/model-00005-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d9450b209df4cb026b6669e618fcdb6ff4f5bd0a711fb2e502afe0b95eb3ad7
+size 4832007496
diff --git a/model-00006-of-00007.safetensors b/model-00006-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..524e69fde1febd3510b4725ea7cfc5103d79e8d1
--- /dev/null
+++ b/model-00006-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edf01cd5fe74bfd002b701e409ad8e68a02b64c59f6cfb2302b9427953c464f3
+size 4999813120
diff --git a/model-00007-of-00007.safetensors b/model-00007-of-00007.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b73e0db4c4b0097276bbc78a364a5ff57002d272
--- /dev/null
+++ b/model-00007-of-00007.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9dd6df4fc7f009aa39a3b019a55cb97a8509bb44690419b3255311faaf9e89b
+size 2571158184
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,298 @@
+{
+  "metadata": {
+    "total_size": 32121044992
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00007-of-00007.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "model.norm.weight": "model-00007-of-00007.safetensors"
+  }
+}
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,17 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|end_of_text|>"
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0df36e82e06f1188c56f572211c39d7d52f1f46e
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,2062 @@
+{
+  "added_tokens_decoder": {
+    "128000": {
+      "content": "<|begin_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128001": {
+      "content": "<|end_of_text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128002": {
+      "content": "<|reserved_special_token_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128003": {
+      "content": "<|reserved_special_token_1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128004": {
+      "content": "<|finetune_right_pad_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128005": {
+      "content": "<|reserved_special_token_2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128006": {
+      "content": "<|start_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128007": {
+      "content": "<|end_header_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128008": {
+      "content": "<|eom_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128009": {
+      "content": "<|eot_id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128010": {
+      "content": "<|python_tag|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128011": {
+      "content": "<|reserved_special_token_3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128012": {
+      "content": "<|reserved_special_token_4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128013": {
+      "content": "<|reserved_special_token_5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128014": {
+      "content": "<|reserved_special_token_6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128015": {
+      "content": "<|reserved_special_token_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128016": {
+      "content": "<|reserved_special_token_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128017": {
+      "content": "<|reserved_special_token_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128018": {
+      "content": "<|reserved_special_token_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128019": {
+      "content": "<|reserved_special_token_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128020": {
+      "content": "<|reserved_special_token_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128021": {
+      "content": "<|reserved_special_token_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128022": {
+      "content": "<|reserved_special_token_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128023": {
+      "content": "<|reserved_special_token_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128024": {
+      "content": "<|reserved_special_token_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128025": {
+      "content": "<|reserved_special_token_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128026": {
+      "content": "<|reserved_special_token_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128027": {
+      "content": "<|reserved_special_token_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128028": {
+      "content": "<|reserved_special_token_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128029": {
+      "content": "<|reserved_special_token_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128030": {
+      "content": "<|reserved_special_token_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128031": {
+      "content": "<|reserved_special_token_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128032": {
+      "content": "<|reserved_special_token_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128033": {
+      "content": "<|reserved_special_token_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128034": {
+      "content": "<|reserved_special_token_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128035": {
+      "content": "<|reserved_special_token_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128036": {
+      "content": "<|reserved_special_token_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128037": {
+      "content": "<|reserved_special_token_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128038": {
+      "content": "<|reserved_special_token_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128039": {
+      "content": "<|reserved_special_token_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128040": {
+      "content": "<|reserved_special_token_32|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128041": {
+      "content": "<|reserved_special_token_33|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128042": {
+      "content": "<|reserved_special_token_34|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128043": {
+      "content": "<|reserved_special_token_35|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128044": {
+      "content": "<|reserved_special_token_36|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128045": {
+      "content": "<|reserved_special_token_37|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128046": {
+      "content": "<|reserved_special_token_38|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128047": {
+      "content": "<|reserved_special_token_39|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128048": {
+      "content": "<|reserved_special_token_40|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128049": {
+      "content": "<|reserved_special_token_41|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128050": {
+      "content": "<|reserved_special_token_42|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128051": {
+      "content": "<|reserved_special_token_43|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128052": {
+      "content": "<|reserved_special_token_44|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128053": {
+      "content": "<|reserved_special_token_45|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128054": {
+      "content": "<|reserved_special_token_46|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128055": {
+      "content": "<|reserved_special_token_47|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128056": {
+      "content": "<|reserved_special_token_48|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128057": {
+      "content": "<|reserved_special_token_49|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128058": {
+      "content": "<|reserved_special_token_50|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128059": {
+      "content": "<|reserved_special_token_51|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128060": {
+      "content": "<|reserved_special_token_52|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128061": {
+      "content": "<|reserved_special_token_53|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128062": {
+      "content": "<|reserved_special_token_54|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128063": {
+      "content": "<|reserved_special_token_55|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128064": {
+      "content": "<|reserved_special_token_56|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128065": {
+      "content": "<|reserved_special_token_57|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128066": {
+      "content": "<|reserved_special_token_58|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128067": {
+      "content": "<|reserved_special_token_59|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128068": {
+      "content": "<|reserved_special_token_60|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128069": {
+      "content": "<|reserved_special_token_61|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128070": {
+      "content": "<|reserved_special_token_62|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128071": {
+      "content": "<|reserved_special_token_63|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128072": {
+      "content": "<|reserved_special_token_64|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128073": {
+      "content": "<|reserved_special_token_65|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128074": {
+      "content": "<|reserved_special_token_66|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128075": {
+      "content": "<|reserved_special_token_67|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128076": {
+      "content": "<|reserved_special_token_68|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128077": {
+      "content": "<|reserved_special_token_69|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128078": {
+      "content": "<|reserved_special_token_70|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128079": {
+      "content": "<|reserved_special_token_71|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128080": {
+      "content": "<|reserved_special_token_72|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128081": {
+      "content": "<|reserved_special_token_73|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128082": {
+      "content": "<|reserved_special_token_74|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128083": {
+      "content": "<|reserved_special_token_75|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128084": {
+      "content": "<|reserved_special_token_76|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128085": {
+      "content": "<|reserved_special_token_77|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128086": {
+      "content": "<|reserved_special_token_78|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128087": {
+      "content": "<|reserved_special_token_79|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128088": {
+      "content": "<|reserved_special_token_80|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128089": {
+      "content": "<|reserved_special_token_81|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128090": {
+      "content": "<|reserved_special_token_82|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128091": {
+      "content": "<|reserved_special_token_83|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128092": {
+      "content": "<|reserved_special_token_84|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128093": {
+      "content": "<|reserved_special_token_85|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128094": {
+      "content": "<|reserved_special_token_86|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128095": {
+      "content": "<|reserved_special_token_87|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128096": {
+      "content": "<|reserved_special_token_88|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128097": {
+      "content": "<|reserved_special_token_89|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128098": {
+      "content": "<|reserved_special_token_90|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128099": {
+      "content": "<|reserved_special_token_91|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128100": {
+      "content": "<|reserved_special_token_92|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128101": {
+      "content": "<|reserved_special_token_93|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128102": {
+      "content": "<|reserved_special_token_94|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128103": {
+      "content": "<|reserved_special_token_95|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128104": {
+      "content": "<|reserved_special_token_96|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128105": {
+      "content": "<|reserved_special_token_97|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128106": {
+      "content": "<|reserved_special_token_98|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128107": {
+      "content": "<|reserved_special_token_99|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128108": {
+      "content": "<|reserved_special_token_100|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128109": {
+      "content": "<|reserved_special_token_101|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128110": {
+      "content": "<|reserved_special_token_102|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128111": {
+      "content": "<|reserved_special_token_103|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128112": {
+      "content": "<|reserved_special_token_104|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128113": {
+      "content": "<|reserved_special_token_105|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128114": {
+      "content": "<|reserved_special_token_106|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128115": {
+      "content": "<|reserved_special_token_107|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128116": {
+      "content": "<|reserved_special_token_108|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128117": {
+      "content": "<|reserved_special_token_109|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128118": {
+      "content": "<|reserved_special_token_110|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128119": {
+      "content": "<|reserved_special_token_111|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128120": {
+      "content": "<|reserved_special_token_112|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128121": {
+      "content": "<|reserved_special_token_113|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128122": {
+      "content": "<|reserved_special_token_114|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128123": {
+      "content": "<|reserved_special_token_115|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128124": {
+      "content": "<|reserved_special_token_116|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128125": {
+      "content": "<|reserved_special_token_117|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128126": {
+      "content": "<|reserved_special_token_118|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128127": {
+      "content": "<|reserved_special_token_119|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128128": {
+      "content": "<|reserved_special_token_120|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128129": {
+      "content": "<|reserved_special_token_121|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128130": {
+      "content": "<|reserved_special_token_122|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128131": {
+      "content": "<|reserved_special_token_123|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128132": {
+      "content": "<|reserved_special_token_124|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128133": {
+      "content": "<|reserved_special_token_125|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128134": {
+      "content": "<|reserved_special_token_126|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128135": {
+      "content": "<|reserved_special_token_127|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128136": {
+      "content": "<|reserved_special_token_128|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128137": {
+      "content": "<|reserved_special_token_129|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128138": {
+      "content": "<|reserved_special_token_130|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128139": {
+      "content": "<|reserved_special_token_131|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128140": {
+      "content": "<|reserved_special_token_132|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128141": {
+      "content": "<|reserved_special_token_133|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128142": {
+      "content": "<|reserved_special_token_134|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128143": {
+      "content": "<|reserved_special_token_135|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128144": {
+      "content": "<|reserved_special_token_136|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128145": {
+      "content": "<|reserved_special_token_137|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128146": {
+      "content": "<|reserved_special_token_138|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128147": {
+      "content": "<|reserved_special_token_139|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128148": {
+      "content": "<|reserved_special_token_140|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128149": {
+      "content": "<|reserved_special_token_141|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128150": {
+      "content": "<|reserved_special_token_142|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128151": {
+      "content": "<|reserved_special_token_143|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128152": {
+      "content": "<|reserved_special_token_144|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128153": {
+      "content": "<|reserved_special_token_145|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128154": {
+      "content": "<|reserved_special_token_146|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128155": {
+      "content": "<|reserved_special_token_147|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128156": {
+      "content": "<|reserved_special_token_148|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128157": {
+      "content": "<|reserved_special_token_149|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128158": {
+      "content": "<|reserved_special_token_150|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128159": {
+      "content": "<|reserved_special_token_151|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128160": {
+      "content": "<|reserved_special_token_152|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128161": {
+      "content": "<|reserved_special_token_153|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128162": {
+      "content": "<|reserved_special_token_154|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128163": {
+      "content": "<|reserved_special_token_155|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128164": {
+      "content": "<|reserved_special_token_156|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128165": {
+      "content": "<|reserved_special_token_157|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128166": {
+      "content": "<|reserved_special_token_158|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128167": {
+      "content": "<|reserved_special_token_159|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128168": {
+      "content": "<|reserved_special_token_160|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128169": {
+      "content": "<|reserved_special_token_161|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128170": {
+      "content": "<|reserved_special_token_162|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128171": {
+      "content": "<|reserved_special_token_163|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128172": {
+      "content": "<|reserved_special_token_164|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128173": {
+      "content": "<|reserved_special_token_165|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128174": {
+      "content": "<|reserved_special_token_166|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128175": {
+      "content": "<|reserved_special_token_167|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128176": {
+      "content": "<|reserved_special_token_168|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128177": {
+      "content": "<|reserved_special_token_169|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128178": {
+      "content": "<|reserved_special_token_170|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128179": {
+      "content": "<|reserved_special_token_171|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128180": {
+      "content": "<|reserved_special_token_172|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128181": {
+      "content": "<|reserved_special_token_173|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128182": {
+      "content": "<|reserved_special_token_174|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128183": {
+      "content": "<|reserved_special_token_175|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128184": {
+      "content": "<|reserved_special_token_176|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128185": {
+      "content": "<|reserved_special_token_177|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128186": {
+      "content": "<|reserved_special_token_178|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128187": {
+      "content": "<|reserved_special_token_179|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128188": {
+      "content": "<|reserved_special_token_180|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128189": {
+      "content": "<|reserved_special_token_181|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128190": {
+      "content": "<|reserved_special_token_182|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128191": {
+      "content": "<|reserved_special_token_183|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128192": {
+      "content": "<|reserved_special_token_184|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128193": {
+      "content": "<|reserved_special_token_185|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128194": {
+      "content": "<|reserved_special_token_186|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128195": {
+      "content": "<|reserved_special_token_187|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128196": {
+      "content": "<|reserved_special_token_188|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128197": {
+      "content": "<|reserved_special_token_189|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128198": {
+      "content": "<|reserved_special_token_190|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128199": {
+      "content": "<|reserved_special_token_191|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128200": {
+      "content": "<|reserved_special_token_192|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128201": {
+      "content": "<|reserved_special_token_193|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128202": {
+      "content": "<|reserved_special_token_194|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128203": {
+      "content": "<|reserved_special_token_195|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128204": {
+      "content": "<|reserved_special_token_196|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128205": {
+      "content": "<|reserved_special_token_197|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128206": {
+      "content": "<|reserved_special_token_198|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128207": {
+      "content": "<|reserved_special_token_199|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128208": {
+      "content": "<|reserved_special_token_200|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128209": {
+      "content": "<|reserved_special_token_201|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128210": {
+      "content": "<|reserved_special_token_202|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128211": {
+      "content": "<|reserved_special_token_203|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128212": {
+      "content": "<|reserved_special_token_204|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128213": {
+      "content": "<|reserved_special_token_205|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128214": {
+      "content": "<|reserved_special_token_206|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128215": {
+      "content": "<|reserved_special_token_207|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128216": {
+      "content": "<|reserved_special_token_208|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128217": {
+      "content": "<|reserved_special_token_209|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128218": {
+      "content": "<|reserved_special_token_210|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128219": {
+      "content": "<|reserved_special_token_211|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128220": {
+      "content": "<|reserved_special_token_212|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128221": {
+      "content": "<|reserved_special_token_213|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128222": {
+      "content": "<|reserved_special_token_214|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128223": {
+      "content": "<|reserved_special_token_215|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128224": {
+      "content": "<|reserved_special_token_216|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128225": {
+      "content": "<|reserved_special_token_217|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128226": {
+      "content": "<|reserved_special_token_218|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128227": {
+      "content": "<|reserved_special_token_219|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128228": {
+      "content": "<|reserved_special_token_220|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128229": {
+      "content": "<|reserved_special_token_221|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128230": {
+      "content": "<|reserved_special_token_222|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128231": {
+      "content": "<|reserved_special_token_223|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128232": {
+      "content": "<|reserved_special_token_224|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128233": {
+      "content": "<|reserved_special_token_225|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128234": {
+      "content": "<|reserved_special_token_226|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128235": {
+      "content": "<|reserved_special_token_227|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128236": {
+      "content": "<|reserved_special_token_228|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128237": {
+      "content": "<|reserved_special_token_229|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128238": {
+      "content": "<|reserved_special_token_230|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128239": {
+      "content": "<|reserved_special_token_231|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128240": {
+      "content": "<|reserved_special_token_232|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128241": {
+      "content": "<|reserved_special_token_233|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128242": {
+      "content": "<|reserved_special_token_234|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128243": {
+      "content": "<|reserved_special_token_235|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128244": {
+      "content": "<|reserved_special_token_236|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128245": {
+      "content": "<|reserved_special_token_237|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128246": {
+      "content": "<|reserved_special_token_238|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128247": {
+      "content": "<|reserved_special_token_239|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128248": {
+      "content": "<|reserved_special_token_240|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128249": {
+      "content": "<|reserved_special_token_241|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128250": {
+      "content": "<|reserved_special_token_242|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128251": {
+      "content": "<|reserved_special_token_243|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128252": {
+      "content": "<|reserved_special_token_244|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128253": {
+      "content": "<|reserved_special_token_245|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128254": {
+      "content": "<|reserved_special_token_246|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "128255": {
+      "content": "<|reserved_special_token_247|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|end_of_text|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|end_of_text|>",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3ec322270d2d1a136ed4e6ec9a7cdf4837f5f542
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4073c7e634db6217c9c50dbc3e636b8c05df4a3686548baa23c4d653fa3409e3
+size 5432