diff --git a/checkpoint-0/config.json b/checkpoint-0/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-0/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-0/pytorch_model.bin b/checkpoint-0/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8d818f4046828eb7738fe21753ae72612233b90b
--- /dev/null
+++ b/checkpoint-0/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c042e3e79fe1248045081b9b91d7a1a79a839c21efc3919fe94d504c34664ecf
+size 510424169
diff --git a/checkpoint-0/special_tokens_map.json b/checkpoint-0/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-0/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-0/tokenizer_config.json b/checkpoint-0/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-0/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-0/training_args.bin b/checkpoint-0/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-0/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-100/config.json b/checkpoint-100/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-100/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d8b2f593242f179cc3bab7fb899d8d13d88c67d0
--- /dev/null
+++ b/checkpoint-100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fed13c95e20c1735c47eb39480281dcfbf32a3731abebf4f032dcfb0967150eb
+size 995659121
diff --git a/checkpoint-100/pytorch_model.bin b/checkpoint-100/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..efcba7173f34a7456b1e1ca38f74e51ab6585199
--- /dev/null
+++ b/checkpoint-100/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3af64898abe8bc4fae14a2a2b1b3f95007f77876db6ddea30f933be7c897ed88
+size 510424169
diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..05af2246c84563c2e65febfbf063485c73bcaefc
--- /dev/null
+++ b/checkpoint-100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83fe06d38cbde496d05431dbc77717e82bc515a4e8ef899fb7c6958216c7755d
+size 14567
diff --git a/checkpoint-100/scaler.pt b/checkpoint-100/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..45cc4a33e17645cb0ed4a911b11c77cb2e7ce7f3
--- /dev/null
+++ b/checkpoint-100/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13a3423b2fe42f204bc8fe2c666ff379f9fd753a0f13613064a5e71e86b519e8
+size 559
diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..595eec9094e91b3eb24c2de88c461df2c22026ab
--- /dev/null
+++ b/checkpoint-100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75e9b9d31d11c624d89b0c04ad496adf4b5addd3e703848d2583972c703e8da6
+size 623
diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-100/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-100/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a24e4a621d539efd264437be91d8e2063c9f0a4
--- /dev/null
+++ b/checkpoint-100/trainer_state.json
@@ -0,0 +1,34 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.015833333333333,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 2.703849947136e+16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-1000/config.json b/checkpoint-1000/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-1000/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e809e961336227b687484632563dc95a252326d0
--- /dev/null
+++ b/checkpoint-1000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c8bf65a620b942d3e732c8ff9c74f98cca6b60c839af2217a5f714aaaeb570e
+size 995659313
diff --git a/checkpoint-1000/pytorch_model.bin b/checkpoint-1000/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9e319a40b55b7300de772b2f9b43fe7e2c665db9
--- /dev/null
+++ b/checkpoint-1000/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1c5eca7db51fec78fee1f801ddaae71c5cc75756018c51c5c146fe96166152f
+size 510424169
diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..1014b62ed04d8bbe004e782e624771c2f1c246ef
--- /dev/null
+++ b/checkpoint-1000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c75a64c4092fee04fa4a480496a8a8f0dd3c8dc62fe12ea9973d05836d1733ee
+size 14567
diff --git a/checkpoint-1000/scaler.pt b/checkpoint-1000/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b04695d3a30e4bab2b78883d9c849c25c37ef7d7
--- /dev/null
+++ b/checkpoint-1000/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f810fc7b695697c440d8985f6042b4ba23a9e1027604c265718b518ca29f1b2b
+size 559
diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2a616899591a36b66ac0bd1ceeb324087c181a2a
--- /dev/null
+++ b/checkpoint-1000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0691206f4bd9ca409d6e7104087a4e0eb05df8f8f555a400f6ecc532edba52d8
+size 623
diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-1000/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-1000/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b655f1ae3cfcbd1c0b597a12a1c29e4e6f69fb94
--- /dev/null
+++ b/checkpoint-1000/trainer_state.json
@@ -0,0 +1,158 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 37.00083333333333,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    },
+    {
+      "epoch": 16.02,
+      "learning_rate": 0.00041666666666666664,
+      "loss": 4.6037,
+      "step": 450
+    },
+    {
+      "epoch": 18.01,
+      "learning_rate": 0.00038888888888888887,
+      "loss": 4.4024,
+      "step": 500
+    },
+    {
+      "epoch": 20.01,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 4.2409,
+      "step": 550
+    },
+    {
+      "epoch": 22.0,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 4.1107,
+      "step": 600
+    },
+    {
+      "epoch": 24.0,
+      "learning_rate": 0.00030555555555555555,
+      "loss": 3.9943,
+      "step": 650
+    },
+    {
+      "epoch": 25.02,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 3.864,
+      "step": 700
+    },
+    {
+      "epoch": 27.02,
+      "learning_rate": 0.00025,
+      "loss": 3.805,
+      "step": 750
+    },
+    {
+      "epoch": 29.01,
+      "learning_rate": 0.00022222222222222218,
+      "loss": 3.726,
+      "step": 800
+    },
+    {
+      "epoch": 31.01,
+      "learning_rate": 0.00019444444444444443,
+      "loss": 3.6562,
+      "step": 850
+    },
+    {
+      "epoch": 33.01,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 3.5922,
+      "step": 900
+    },
+    {
+      "epoch": 35.0,
+      "learning_rate": 0.0001388888888888889,
+      "loss": 3.5355,
+      "step": 950
+    },
+    {
+      "epoch": 37.0,
+      "learning_rate": 0.00011111111111111109,
+      "loss": 3.4861,
+      "step": 1000
+    },
+    {
+      "epoch": 37.0,
+      "eval_loss": 4.404828071594238,
+      "eval_runtime": 4.6328,
+      "eval_samples_per_second": 22.017,
+      "eval_steps_per_second": 1.511,
+      "step": 1000
+    },
+    {
+      "epoch": 37.0,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_loss": 4.404828071594238,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_ppl": 81.84507014102485,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_runtime": 4.6328,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_samples_per_second": 22.017,
+      "step": 1000
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 2.7104345063424e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-1100/config.json b/checkpoint-1100/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-1100/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-1100/optimizer.pt b/checkpoint-1100/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..adfe562ee74ac231ff195b05e6cf7dc091a3bc97
--- /dev/null
+++ b/checkpoint-1100/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1bfd4fdbef65f1742273631ec2a37aa81facf4e13958f76c0e641748f18ee3e
+size 995659313
diff --git a/checkpoint-1100/pytorch_model.bin b/checkpoint-1100/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fcc010729023dd84906713a860efcc06f80347e7
--- /dev/null
+++ b/checkpoint-1100/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e28f9fcf7c62b457c742cd933c7eb1a5386f7132ca13021e5da78624ce4b5f24
+size 510424169
diff --git a/checkpoint-1100/rng_state.pth b/checkpoint-1100/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..af51324c7edee29ed500d47be016458df8232a40
--- /dev/null
+++ b/checkpoint-1100/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1dc591192aac0a1beb3589bb6bd76d45a99911b830fa10fbb06a961cf60c5fc
+size 14567
diff --git a/checkpoint-1100/scaler.pt b/checkpoint-1100/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..37bf049fbd5fd721203bf0238edc8ff67dbd8f94
--- /dev/null
+++ b/checkpoint-1100/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fb16c30b686aa43e110b0d33f9d46bf3127b7124542ca8dc34831233d4675a0
+size 559
diff --git a/checkpoint-1100/scheduler.pt b/checkpoint-1100/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c158169ec0bb09620952d544fd4b4edea0cc9cf4
--- /dev/null
+++ b/checkpoint-1100/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f1b7713e4bb40428f29080b7d08d4a52f779ac863737861e4724292b2cf6c59
+size 623
diff --git a/checkpoint-1100/special_tokens_map.json b/checkpoint-1100/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-1100/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-1100/tokenizer_config.json b/checkpoint-1100/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-1100/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-1100/trainer_state.json b/checkpoint-1100/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2fa5e8362c22a08911196e537560e7deda33a5fb
--- /dev/null
+++ b/checkpoint-1100/trainer_state.json
@@ -0,0 +1,170 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 40.016666666666666,
+  "global_step": 1100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    },
+    {
+      "epoch": 16.02,
+      "learning_rate": 0.00041666666666666664,
+      "loss": 4.6037,
+      "step": 450
+    },
+    {
+      "epoch": 18.01,
+      "learning_rate": 0.00038888888888888887,
+      "loss": 4.4024,
+      "step": 500
+    },
+    {
+      "epoch": 20.01,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 4.2409,
+      "step": 550
+    },
+    {
+      "epoch": 22.0,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 4.1107,
+      "step": 600
+    },
+    {
+      "epoch": 24.0,
+      "learning_rate": 0.00030555555555555555,
+      "loss": 3.9943,
+      "step": 650
+    },
+    {
+      "epoch": 25.02,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 3.864,
+      "step": 700
+    },
+    {
+      "epoch": 27.02,
+      "learning_rate": 0.00025,
+      "loss": 3.805,
+      "step": 750
+    },
+    {
+      "epoch": 29.01,
+      "learning_rate": 0.00022222222222222218,
+      "loss": 3.726,
+      "step": 800
+    },
+    {
+      "epoch": 31.01,
+      "learning_rate": 0.00019444444444444443,
+      "loss": 3.6562,
+      "step": 850
+    },
+    {
+      "epoch": 33.01,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 3.5922,
+      "step": 900
+    },
+    {
+      "epoch": 35.0,
+      "learning_rate": 0.0001388888888888889,
+      "loss": 3.5355,
+      "step": 950
+    },
+    {
+      "epoch": 37.0,
+      "learning_rate": 0.00011111111111111109,
+      "loss": 3.4861,
+      "step": 1000
+    },
+    {
+      "epoch": 37.0,
+      "eval_loss": 4.404828071594238,
+      "eval_runtime": 4.6328,
+      "eval_samples_per_second": 22.017,
+      "eval_steps_per_second": 1.511,
+      "step": 1000
+    },
+    {
+      "epoch": 37.0,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_loss": 4.404828071594238,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_ppl": 81.84507014102485,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_runtime": 4.6328,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_samples_per_second": 22.017,
+      "step": 1000
+    },
+    {
+      "epoch": 38.02,
+      "learning_rate": 8.333333333333333e-05,
+      "loss": 3.4188,
+      "step": 1050
+    },
+    {
+      "epoch": 40.02,
+      "learning_rate": 5.5555555555555545e-05,
+      "loss": 3.4056,
+      "step": 1100
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 2.980819501056e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1100/training_args.bin b/checkpoint-1100/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-1100/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-1200/config.json b/checkpoint-1200/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-1200/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a4f762718da5c500de7f36d761350eec63039866
--- /dev/null
+++ b/checkpoint-1200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a46a6e4f1bd7903cf1ecfc7c770fd346658ffec35ec7c5ce071acb7d40bf12fa
+size 995659313
diff --git a/checkpoint-1200/pytorch_model.bin b/checkpoint-1200/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38ac9954f469fafdf639fb422b47d67f8a40dc5b
--- /dev/null
+++ b/checkpoint-1200/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4e27e6700f2d4b2a7cc35e1bfec8f1cd9b7bd8425158df365f791e673267f38
+size 510424169
diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..be8d0f6e5cca3abc79a809208ffd0cffa83740f9
--- /dev/null
+++ b/checkpoint-1200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e44a1437033dc4a40dcd08d168e696536c0dd767a31742b456be2a56670f193d
+size 14567
diff --git a/checkpoint-1200/scaler.pt b/checkpoint-1200/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8953dddccbefc4703c09dcda27d83c15add2bade
--- /dev/null
+++ b/checkpoint-1200/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19c7277eaca0850ae3e9b6790b3d002d820169cce0671185e672c28c8ae8e056
+size 559
diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..310d39c17fc616a9c83286ed00f0f4cefba9f5df
--- /dev/null
+++ b/checkpoint-1200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:935a8fb09a6e9698d9894853b05e181b3f56098deaaecddde08e55f06bf000c4
+size 623
diff --git a/checkpoint-1200/special_tokens_map.json b/checkpoint-1200/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-1200/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-1200/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..483ae10337eecea2f0b24ed80c84b0409bf6e7b7
--- /dev/null
+++ b/checkpoint-1200/trainer_state.json
@@ -0,0 +1,182 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 44.01,
+  "global_step": 1200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    },
+    {
+      "epoch": 16.02,
+      "learning_rate": 0.00041666666666666664,
+      "loss": 4.6037,
+      "step": 450
+    },
+    {
+      "epoch": 18.01,
+      "learning_rate": 0.00038888888888888887,
+      "loss": 4.4024,
+      "step": 500
+    },
+    {
+      "epoch": 20.01,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 4.2409,
+      "step": 550
+    },
+    {
+      "epoch": 22.0,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 4.1107,
+      "step": 600
+    },
+    {
+      "epoch": 24.0,
+      "learning_rate": 0.00030555555555555555,
+      "loss": 3.9943,
+      "step": 650
+    },
+    {
+      "epoch": 25.02,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 3.864,
+      "step": 700
+    },
+    {
+      "epoch": 27.02,
+      "learning_rate": 0.00025,
+      "loss": 3.805,
+      "step": 750
+    },
+    {
+      "epoch": 29.01,
+      "learning_rate": 0.00022222222222222218,
+      "loss": 3.726,
+      "step": 800
+    },
+    {
+      "epoch": 31.01,
+      "learning_rate": 0.00019444444444444443,
+      "loss": 3.6562,
+      "step": 850
+    },
+    {
+      "epoch": 33.01,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 3.5922,
+      "step": 900
+    },
+    {
+      "epoch": 35.0,
+      "learning_rate": 0.0001388888888888889,
+      "loss": 3.5355,
+      "step": 950
+    },
+    {
+      "epoch": 37.0,
+      "learning_rate": 0.00011111111111111109,
+      "loss": 3.4861,
+      "step": 1000
+    },
+    {
+      "epoch": 37.0,
+      "eval_loss": 4.404828071594238,
+      "eval_runtime": 4.6328,
+      "eval_samples_per_second": 22.017,
+      "eval_steps_per_second": 1.511,
+      "step": 1000
+    },
+    {
+      "epoch": 37.0,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_loss": 4.404828071594238,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_ppl": 81.84507014102485,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_runtime": 4.6328,
+      "eval_/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_samples_per_second": 22.017,
+      "step": 1000
+    },
+    {
+      "epoch": 38.02,
+      "learning_rate": 8.333333333333333e-05,
+      "loss": 3.4188,
+      "step": 1050
+    },
+    {
+      "epoch": 40.02,
+      "learning_rate": 5.5555555555555545e-05,
+      "loss": 3.4056,
+      "step": 1100
+    },
+    {
+      "epoch": 42.01,
+      "learning_rate": 2.7777777777777772e-05,
+      "loss": 3.3751,
+      "step": 1150
+    },
+    {
+      "epoch": 44.01,
+      "learning_rate": 0.0,
+      "loss": 3.3526,
+      "step": 1200
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 3.2521451470848e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-1200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-200/config.json b/checkpoint-200/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-200/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..65a47e4b85392cac6e5b4a594da8d9e535a1c1bc
--- /dev/null
+++ b/checkpoint-200/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db9aafe536b0bebe73b60ce84dca49e6a4b56973c5fafb7de008bb0cc119f65b
+size 995659121
diff --git a/checkpoint-200/pytorch_model.bin b/checkpoint-200/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..06308a75a101aa208cffc4e9ac1447f22f67ab66
--- /dev/null
+++ b/checkpoint-200/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac21cfaf6a4c901931f822c8b2e9e1c897e906d9d5bdaa07ae4c6656f8fb63bd
+size 510424169
diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..baadda06f4c0bcf303815526515db09f1da94d32
--- /dev/null
+++ b/checkpoint-200/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca866ec3d619cb6286b5c793c21b348cbc3fdc169864e1888e96c0ca890fb93f
+size 14567
diff --git a/checkpoint-200/scaler.pt b/checkpoint-200/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..365b52ebf376498237a843f6d7332e5a49b14902
--- /dev/null
+++ b/checkpoint-200/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb6982c29cd162f49aeb531674acf574eccd46a8f556bec596040d7c3b95200a
+size 559
diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..36cb8da739c80a63971a62f06f781c40ac0fceb2
--- /dev/null
+++ b/checkpoint-200/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a992625ada6d884e508ff9392d16738b4a4163147f8fcbf9f46be82ecae9888
+size 623
diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-200/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-200/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..00af9015680457b30605cc8c3850c5cb1044a32b
--- /dev/null
+++ b/checkpoint-200/trainer_state.json
@@ -0,0 +1,46 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 7.009166666666666,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 5.417106407424e+16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-200/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-300/config.json b/checkpoint-300/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-300/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..102299182506ab1bb49e382bd834411e928951a4
--- /dev/null
+++ b/checkpoint-300/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b6569ac66702803817f797a39e0cbedb2db31f20ab5172ca69253f49fb7612f
+size 995659313
diff --git a/checkpoint-300/pytorch_model.bin b/checkpoint-300/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..402dfe326a2dfd002083c6d0a497599f4d3bc562
--- /dev/null
+++ b/checkpoint-300/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a4ab4569350d1d8e1ecae00f5e8424bc7ade8cd02bf6bc93979d0cab7164041
+size 510424169
diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c333d63042753a25fe5a52aa3820c6374e2340ce
--- /dev/null
+++ b/checkpoint-300/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5159e17b41aa3e0e5d31e60ceeedc853d820418bf0ad70c1fdcededb7d002a5
+size 14567
diff --git a/checkpoint-300/scaler.pt b/checkpoint-300/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..01066cf4761ea9d2f7962f5181762f7b08690b79
--- /dev/null
+++ b/checkpoint-300/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0967b9f865f16344c55f5ccc3cf7d6e8e97ca61dda304e931ca6bad130f48dd1
+size 559
diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..681987f21e79cd4685aac32a5e6b74341e25d936
--- /dev/null
+++ b/checkpoint-300/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1db899b266916f792a0898ceb27a87eaf76647f10c29cc0c13ce22f12a12efd
+size 623
diff --git a/checkpoint-300/special_tokens_map.json b/checkpoint-300/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-300/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-300/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9731f9843a6c42fc6a08b3edbb8520aa24dbc9b
--- /dev/null
+++ b/checkpoint-300/trainer_state.json
@@ -0,0 +1,58 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 11.0025,
+  "global_step": 300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 8.130362867712e+16,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-300/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-400/config.json b/checkpoint-400/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-400/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0e502ab9495a496d5474a7300cfced3f1c099a8f
--- /dev/null
+++ b/checkpoint-400/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f0d2564c822a3f4c89e90ff3d3640fc57114fa9d121d2644591222e22729dca
+size 995659313
diff --git a/checkpoint-400/pytorch_model.bin b/checkpoint-400/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3c6991af72917ca5e9095df896205e0e222d2625
--- /dev/null
+++ b/checkpoint-400/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc7468a532daf74e48a1a5d67d6568613c7d784582bcdda8931001963a911a7a
+size 510424169
diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..05a58a0115450a33201ad62bf6a313a376a23b03
--- /dev/null
+++ b/checkpoint-400/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f17bb7197f62145c4e43555bdbb4a016ade5732ce728afa71a32b5e01bbfb6d7
+size 14567
diff --git a/checkpoint-400/scaler.pt b/checkpoint-400/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9c7aef4199e98d81152810b661dbaffc01963383
--- /dev/null
+++ b/checkpoint-400/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:476e510c8ea7edbd2b51d1e76a4e037820a5639381c0d8b5d32dafa492795a1e
+size 559
diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..967673cfc91836d239beb5a7ede06992232ba309
--- /dev/null
+++ b/checkpoint-400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db087d678047b5c346bbb8511936612c1fdf223c6fd70321e97369bc31ed76a8
+size 623
diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-400/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-400/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1058b0c1100a2644967a9cf24741f066c45ce957
--- /dev/null
+++ b/checkpoint-400/trainer_state.json
@@ -0,0 +1,70 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 14.018333333333333,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 1.0834212814848e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-400/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-500/config.json b/checkpoint-500/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-500/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..91a1df98fe36e5e4db3efab1ad3f15a5ace805b7
--- /dev/null
+++ b/checkpoint-500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93367fa629bc6638053d95bfa1ac81fc1361d5bd75717cfa74b321b61941e239
+size 995659313
diff --git a/checkpoint-500/pytorch_model.bin b/checkpoint-500/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9a801ddfad3e8f53219ee8dbd44619ad92e86a74
--- /dev/null
+++ b/checkpoint-500/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8acaa501f8df1e288deb4082611f7fa527c88a6294f052d489ebba67024cf4e
+size 510424169
diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..46e9cb7d93c4fbe66d2ba798099cf1b5269979c1
--- /dev/null
+++ b/checkpoint-500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18bcdf3953d0c79c829d50d6a42474135afcbcecaf547a4aeb1a143d6958bf1e
+size 14567
diff --git a/checkpoint-500/scaler.pt b/checkpoint-500/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e8b96c9c2837f2c95b1d07d4fc3f245f9ad1ef62
--- /dev/null
+++ b/checkpoint-500/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fa4c7be44c959599b8b43bb9bc3371e9e4e5bbc5758b3ab5afcccfda3e72e67
+size 559
diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..77b958181a5b47b022b96b38a6207f274a1b6604
--- /dev/null
+++ b/checkpoint-500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:026fae4a90d56c24de94b10dfa7a75b6ba4e43bd5c1a3fdb2d77356b81cd6f8a
+size 623
diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-500/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-500/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f433f727dae6f7724452679d53369495448ba5d9
--- /dev/null
+++ b/checkpoint-500/trainer_state.json
@@ -0,0 +1,82 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 18.011666666666667,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    },
+    {
+      "epoch": 16.02,
+      "learning_rate": 0.00041666666666666664,
+      "loss": 4.6037,
+      "step": 450
+    },
+    {
+      "epoch": 18.01,
+      "learning_rate": 0.00038888888888888887,
+      "loss": 4.4024,
+      "step": 500
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 1.3547469275136e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-600/config.json b/checkpoint-600/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-600/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..36e832873e34e3b6bd894754dde2c113c596a4ef
--- /dev/null
+++ b/checkpoint-600/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9103ad2ce6ccbc0e5f8b481fcd9cc516358a96122f8e6dde3662e4eddd3a11d1
+size 995659313
diff --git a/checkpoint-600/pytorch_model.bin b/checkpoint-600/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2c50d8d2bd817c7133aaabe85af4b532602b8c53
--- /dev/null
+++ b/checkpoint-600/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4709fddfad018d2834559aee144a4eb929a1cfab5539e5dddfe6b4ea94644f5e
+size 510424169
diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a972e30975c02afd9fcf899e0933151b14a18c76
--- /dev/null
+++ b/checkpoint-600/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fe5dc14709ee2c62ca4726a432da23b370929e34a86df5df86098ddbeb109ce
+size 14567
diff --git a/checkpoint-600/scaler.pt b/checkpoint-600/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f95b3e36da01561ec333a83ee8419ad225633e06
--- /dev/null
+++ b/checkpoint-600/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e8415b86bbce347c0df306b84a695add049c2a3b2d0b6f4dda3bf036d341150
+size 559
diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..62e5359badd619e7dad2c51d98fa8043d9948f0b
--- /dev/null
+++ b/checkpoint-600/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:700940432b1c2117248896e2ce5a58d93c051d92ea97707f74d76bf1ef24deee
+size 623
diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-600/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-600/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0cad78f2d3bc3f14fb953570d7ae59bce067e615
--- /dev/null
+++ b/checkpoint-600/trainer_state.json
@@ -0,0 +1,94 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 22.005,
+  "global_step": 600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    },
+    {
+      "epoch": 16.02,
+      "learning_rate": 0.00041666666666666664,
+      "loss": 4.6037,
+      "step": 450
+    },
+    {
+      "epoch": 18.01,
+      "learning_rate": 0.00038888888888888887,
+      "loss": 4.4024,
+      "step": 500
+    },
+    {
+      "epoch": 20.01,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 4.2409,
+      "step": 550
+    },
+    {
+      "epoch": 22.0,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 4.1107,
+      "step": 600
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 1.6260725735424e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-700/config.json b/checkpoint-700/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-700/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-700/optimizer.pt b/checkpoint-700/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..91bf3d099b0185c761651f6b26772b625d138283
--- /dev/null
+++ b/checkpoint-700/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1548dd45ce27d781bf4e92eb80148179eb01b37d414a91dc050c08ae7196612
+size 995659313
diff --git a/checkpoint-700/pytorch_model.bin b/checkpoint-700/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a3e27acbe2d3d30c7b81a43997560eb0587410af
--- /dev/null
+++ b/checkpoint-700/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dff5f3647571bbcd7a21e17516fb7868fb8b4ec83f073f63f5a00bbe5dff3106
+size 510424169
diff --git a/checkpoint-700/rng_state.pth b/checkpoint-700/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ac9b4d8d7a505e6d87d6006b82a3a76862bb6dc2
--- /dev/null
+++ b/checkpoint-700/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a68b29aa28e86e9a5a1e0badb5ef9c5f3c44042fa2b57fdf1e62edf0a046ca2
+size 14567
diff --git a/checkpoint-700/scaler.pt b/checkpoint-700/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..12f2b8ec834e54a2bd7cfdd0e07b0c6e125b6490
--- /dev/null
+++ b/checkpoint-700/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fb213daf5cce18a5f92167ca14da9df084d907f2b9796efc4666630f312b58c
+size 559
diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..42408b6253265af34ae78746144fbba9316e0d7e
--- /dev/null
+++ b/checkpoint-700/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2d4910fd408e002ebeff50d62bfb043dcae5ef658777d0c3ee4a3bbb515ec15
+size 623
diff --git a/checkpoint-700/special_tokens_map.json b/checkpoint-700/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-700/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-700/tokenizer_config.json b/checkpoint-700/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-700/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..82499559a64bf3640dbceaf596e9c2e5230b34c8
--- /dev/null
+++ b/checkpoint-700/trainer_state.json
@@ -0,0 +1,106 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 25.020833333333332,
+  "global_step": 700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    },
+    {
+      "epoch": 16.02,
+      "learning_rate": 0.00041666666666666664,
+      "loss": 4.6037,
+      "step": 450
+    },
+    {
+      "epoch": 18.01,
+      "learning_rate": 0.00038888888888888887,
+      "loss": 4.4024,
+      "step": 500
+    },
+    {
+      "epoch": 20.01,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 4.2409,
+      "step": 550
+    },
+    {
+      "epoch": 22.0,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 4.1107,
+      "step": 600
+    },
+    {
+      "epoch": 24.0,
+      "learning_rate": 0.00030555555555555555,
+      "loss": 3.9943,
+      "step": 650
+    },
+    {
+      "epoch": 25.02,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 3.864,
+      "step": 700
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 1.896457568256e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-700/training_args.bin b/checkpoint-700/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-700/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-800/config.json b/checkpoint-800/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-800/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0cbf342dfec339d41ca7dd2a6339702177b8ddf5
--- /dev/null
+++ b/checkpoint-800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13a9680eb7b6a8f753f0b49d728cf438f979355321619f6ed8c8f9ab2be319bf
+size 995659313
diff --git a/checkpoint-800/pytorch_model.bin b/checkpoint-800/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8293d1051d71f7dad723c39a08f6ff9c46d781a1
--- /dev/null
+++ b/checkpoint-800/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:27068797376455347259aaa7e9372f8d404a558189de7ef22c03b37a6058a439
+size 510424169
diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..916e3ce83ffb586ad69b6b7359defd155d237872
--- /dev/null
+++ b/checkpoint-800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:439cfc743700f3fc3b82def915edb4a240667c230d5acc6680f419fd08826e71
+size 14567
diff --git a/checkpoint-800/scaler.pt b/checkpoint-800/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b3c73372264156b02df8dada2192ee3c96dd5fc4
--- /dev/null
+++ b/checkpoint-800/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c2074cdcefbaa0a39f736d6b0f7bf018c350d49e85648bc8accc4f756ad816e
+size 559
diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..75713dbce3771306ca00343ecc497c4f19a01d03
--- /dev/null
+++ b/checkpoint-800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b27fb255c84833fb6ab5d93679cb236a569de9a1c4f805f72a2f60a2bc7c7499
+size 623
diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-800/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-800/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..bcb53ffdc63847f20e6bc8c6e1849ca804853cdd
--- /dev/null
+++ b/checkpoint-800/trainer_state.json
@@ -0,0 +1,118 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 29.014166666666668,
+  "global_step": 800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    },
+    {
+      "epoch": 16.02,
+      "learning_rate": 0.00041666666666666664,
+      "loss": 4.6037,
+      "step": 450
+    },
+    {
+      "epoch": 18.01,
+      "learning_rate": 0.00038888888888888887,
+      "loss": 4.4024,
+      "step": 500
+    },
+    {
+      "epoch": 20.01,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 4.2409,
+      "step": 550
+    },
+    {
+      "epoch": 22.0,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 4.1107,
+      "step": 600
+    },
+    {
+      "epoch": 24.0,
+      "learning_rate": 0.00030555555555555555,
+      "loss": 3.9943,
+      "step": 650
+    },
+    {
+      "epoch": 25.02,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 3.864,
+      "step": 700
+    },
+    {
+      "epoch": 27.02,
+      "learning_rate": 0.00025,
+      "loss": 3.805,
+      "step": 750
+    },
+    {
+      "epoch": 29.01,
+      "learning_rate": 0.00022222222222222218,
+      "loss": 3.726,
+      "step": 800
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 2.1677832142848e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/checkpoint-900/config.json b/checkpoint-900/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/checkpoint-900/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
diff --git a/checkpoint-900/optimizer.pt b/checkpoint-900/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a031154a988284fb65f15ee6d40848da3b7d2f15
--- /dev/null
+++ b/checkpoint-900/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a5ef4f46b4f0cab31045eba8cfeb63594c65041da6f7867abaa4878fb8a8e9b
+size 995659313
diff --git a/checkpoint-900/pytorch_model.bin b/checkpoint-900/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..12e399d64d585fd6b46921057781ab63d2362aee
--- /dev/null
+++ b/checkpoint-900/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f925319e54b0aef0207033c4eb836c0eb914dc8fd51b1731e74491d101e837e1
+size 510424169
diff --git a/checkpoint-900/rng_state.pth b/checkpoint-900/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a87122eef8205998fa37471ed8f1e0561b8df855
--- /dev/null
+++ b/checkpoint-900/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c76fa569e1db3795d2e8cd1443d34b1d291e3facb4e37630f0ba4808a6c524e6
+size 14567
diff --git a/checkpoint-900/scaler.pt b/checkpoint-900/scaler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..be54cb13c777bc6feccb478ff218e7e21fad482a
--- /dev/null
+++ b/checkpoint-900/scaler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8695f57df923e22b943b0b0f2b9cc7007008e80b53ccee275b3a35963fe67e9
+size 559
diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..539d7c83ea252818dc9cbffac08cf340bb05a454
--- /dev/null
+++ b/checkpoint-900/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2cb09e5db72772a15094286e93cbb61d745d9b63863703cf53da0bcb9827821
+size 623
diff --git a/checkpoint-900/special_tokens_map.json b/checkpoint-900/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/checkpoint-900/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/checkpoint-900/tokenizer_config.json b/checkpoint-900/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/checkpoint-900/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca933e0a20b4ed79a9dbad67fab9511683dc591f
--- /dev/null
+++ b/checkpoint-900/trainer_state.json
@@ -0,0 +1,130 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 33.0075,
+  "global_step": 900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "learning_rate": 4.9999999999999996e-06,
+      "loss": 10.9216,
+      "step": 1
+    },
+    {
+      "epoch": 1.02,
+      "learning_rate": 0.00025,
+      "loss": 8.2551,
+      "step": 50
+    },
+    {
+      "epoch": 3.02,
+      "learning_rate": 0.0005,
+      "loss": 6.5164,
+      "step": 100
+    },
+    {
+      "epoch": 5.01,
+      "learning_rate": 0.0005833333333333333,
+      "loss": 5.8905,
+      "step": 150
+    },
+    {
+      "epoch": 7.01,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 5.5027,
+      "step": 200
+    },
+    {
+      "epoch": 9.01,
+      "learning_rate": 0.0005277777777777777,
+      "loss": 5.2583,
+      "step": 250
+    },
+    {
+      "epoch": 11.0,
+      "learning_rate": 0.0005,
+      "loss": 5.09,
+      "step": 300
+    },
+    {
+      "epoch": 12.02,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 4.9223,
+      "step": 350
+    },
+    {
+      "epoch": 14.02,
+      "learning_rate": 0.00044444444444444436,
+      "loss": 4.7935,
+      "step": 400
+    },
+    {
+      "epoch": 16.02,
+      "learning_rate": 0.00041666666666666664,
+      "loss": 4.6037,
+      "step": 450
+    },
+    {
+      "epoch": 18.01,
+      "learning_rate": 0.00038888888888888887,
+      "loss": 4.4024,
+      "step": 500
+    },
+    {
+      "epoch": 20.01,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 4.2409,
+      "step": 550
+    },
+    {
+      "epoch": 22.0,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 4.1107,
+      "step": 600
+    },
+    {
+      "epoch": 24.0,
+      "learning_rate": 0.00030555555555555555,
+      "loss": 3.9943,
+      "step": 650
+    },
+    {
+      "epoch": 25.02,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 3.864,
+      "step": 700
+    },
+    {
+      "epoch": 27.02,
+      "learning_rate": 0.00025,
+      "loss": 3.805,
+      "step": 750
+    },
+    {
+      "epoch": 29.01,
+      "learning_rate": 0.00022222222222222218,
+      "loss": 3.726,
+      "step": 800
+    },
+    {
+      "epoch": 31.01,
+      "learning_rate": 0.00019444444444444443,
+      "loss": 3.6562,
+      "step": 850
+    },
+    {
+      "epoch": 33.01,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 3.5922,
+      "step": 900
+    }
+  ],
+  "max_steps": 1200,
+  "num_train_epochs": 9223372036854775807,
+  "total_flos": 2.4391088603136e+17,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-900/training_args.bin b/checkpoint-900/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/checkpoint-900/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e114cdbfd4f4b4eb1d5d2470a255e80109712fc
--- /dev/null
+++ b/config.json
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50265,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50265,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": true,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.2,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 1024
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.18.0",
+  "use_cache": false,
+  "vocab_size": 50266
+}
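
The root config.json mirrors the per-checkpoint configs: a 12-layer, 12-head, 768-dim GPT-2 (`GPT2LMHeadModel`) with a 50,266-token vocabulary and bos/eos id 50265, saved with transformers 4.18.0. A minimal loading sketch under those assumptions; the repo's `PassthroughTokenizer` is a custom class not bundled here, so this sketch feeds already-encoded token ids and leaves tokenization out of scope:

    import torch
    from transformers import GPT2LMHeadModel

    # Load architecture + weights from one of the checkpoint directories in this repo.
    model = GPT2LMHeadModel.from_pretrained("checkpoint-900")
    model.eval()

    # Illustrative pre-encoded input; 50265 is bos_token_id per config.json.
    input_ids = torch.tensor([[50265, 1, 2, 3]])
    with torch.no_grad():
        out = model(input_ids)

    print(out.logits.shape)  # expected (1, 4, 50266), vocab_size from config.json
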
diff --git a/metrics.json b/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..2bb5b51a1536f80212f4d5595ff5d3c00ca44d4c
--- /dev/null
+++ b/metrics.json
@@ -0,0 +1,2505 @@
+{"num_parameters": 124446720, "trainable_parameters": 124446720, "step": 0}
+{"train_info/time_between_train_steps": 3.7133238315582275, "step": 0}
+{"info/global_step": 1, "train_info/time_within_train_step": 27.901622772216797, "step": 1}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 19763.01171875, "train_info/memory_reserved": 22624.0, "train_info/memory_max_reserved": 22624.0, "_timestamp": 1734009585, "_runtime": 36}, "step": 1}
+{"logs": {"train/loss": 10.9216, "train/learning_rate": 4.9999999999999996e-06, "train/epoch": 0.0, "_timestamp": 1734009585, "_runtime": 36}, "step": 1}
+{"train_info/time_between_train_steps": 0.01087808609008789, "step": 1}
+{"info/global_step": 2, "train_info/time_within_train_step": 27.584157943725586, "step": 2}
+{"train_info/time_between_train_steps": 0.006932497024536133, "step": 2}
+{"info/global_step": 3, "train_info/time_within_train_step": 27.31800150871277, "step": 3}
+{"train_info/time_between_train_steps": 0.006711721420288086, "step": 3}
+{"info/global_step": 4, "train_info/time_within_train_step": 27.453080415725708, "step": 4}
+{"train_info/time_between_train_steps": 0.0065729618072509766, "step": 4}
+{"info/global_step": 5, "train_info/time_within_train_step": 27.419596433639526, "step": 5}
+{"train_info/time_between_train_steps": 0.006734609603881836, "step": 5}
+{"info/global_step": 6, "train_info/time_within_train_step": 27.432409524917603, "step": 6}
+{"train_info/time_between_train_steps": 0.006474733352661133, "step": 6}
+{"info/global_step": 7, "train_info/time_within_train_step": 27.307039260864258, "step": 7}
+{"train_info/time_between_train_steps": 0.006439924240112305, "step": 7}
+{"info/global_step": 8, "train_info/time_within_train_step": 27.34648585319519, "step": 8}
+{"train_info/time_between_train_steps": 0.006218910217285156, "step": 8}
+{"info/global_step": 9, "train_info/time_within_train_step": 27.285165548324585, "step": 9}
+{"train_info/time_between_train_steps": 0.005903720855712891, "step": 9}
+{"info/global_step": 10, "train_info/time_within_train_step": 27.285354375839233, "step": 10}
+{"train_info/time_between_train_steps": 0.006175994873046875, "step": 10}
+{"info/global_step": 11, "train_info/time_within_train_step": 27.295811414718628, "step": 11}
+{"train_info/time_between_train_steps": 0.005781412124633789, "step": 11}
+{"info/global_step": 12, "train_info/time_within_train_step": 27.26561999320984, "step": 12}
+{"train_info/time_between_train_steps": 0.00613856315612793, "step": 12}
+{"info/global_step": 13, "train_info/time_within_train_step": 27.286527156829834, "step": 13}
+{"train_info/time_between_train_steps": 0.005697488784790039, "step": 13}
+{"info/global_step": 14, "train_info/time_within_train_step": 27.340028047561646, "step": 14}
+{"train_info/time_between_train_steps": 0.006165981292724609, "step": 14}
+{"info/global_step": 15, "train_info/time_within_train_step": 27.266886711120605, "step": 15}
+{"train_info/time_between_train_steps": 0.005937337875366211, "step": 15}
+{"info/global_step": 16, "train_info/time_within_train_step": 27.40067481994629, "step": 16}
+{"train_info/time_between_train_steps": 0.006292819976806641, "step": 16}
+{"info/global_step": 17, "train_info/time_within_train_step": 27.25371742248535, "step": 17}
+{"train_info/time_between_train_steps": 0.006036996841430664, "step": 17}
+{"info/global_step": 18, "train_info/time_within_train_step": 27.25603485107422, "step": 18}
+{"train_info/time_between_train_steps": 0.005736351013183594, "step": 18}
+{"info/global_step": 19, "train_info/time_within_train_step": 27.298856019973755, "step": 19}
+{"train_info/time_between_train_steps": 0.005998373031616211, "step": 19}
+{"info/global_step": 20, "train_info/time_within_train_step": 27.308612823486328, "step": 20}
+{"train_info/time_between_train_steps": 0.005865573883056641, "step": 20}
+{"info/global_step": 21, "train_info/time_within_train_step": 27.258298873901367, "step": 21}
+{"train_info/time_between_train_steps": 0.008790016174316406, "step": 21}
+{"info/global_step": 22, "train_info/time_within_train_step": 27.287286520004272, "step": 22}
+{"train_info/time_between_train_steps": 0.0061948299407958984, "step": 22}
+{"info/global_step": 23, "train_info/time_within_train_step": 27.256081342697144, "step": 23}
+{"train_info/time_between_train_steps": 0.005888223648071289, "step": 23}
+{"info/global_step": 24, "train_info/time_within_train_step": 27.27766489982605, "step": 24}
+{"train_info/time_between_train_steps": 0.006098508834838867, "step": 24}
+{"info/global_step": 25, "train_info/time_within_train_step": 27.279732704162598, "step": 25}
+{"train_info/time_between_train_steps": 0.00668644905090332, "step": 25}
+{"info/global_step": 26, "train_info/time_within_train_step": 27.336434364318848, "step": 26}
+{"train_info/time_between_train_steps": 0.006532430648803711, "step": 26}
+{"info/global_step": 27, "train_info/time_within_train_step": 27.338104009628296, "step": 27}
+{"train_info/time_between_train_steps": 0.007265567779541016, "step": 27}
+{"train_info/time_between_train_steps": 12.959235191345215, "step": 27}
+{"info/global_step": 28, "train_info/time_within_train_step": 27.302714347839355, "step": 28}
+{"train_info/time_between_train_steps": 0.0060045719146728516, "step": 28}
+{"info/global_step": 29, "train_info/time_within_train_step": 27.412619590759277, "step": 29}
+{"train_info/time_between_train_steps": 0.005700111389160156, "step": 29}
+{"info/global_step": 30, "train_info/time_within_train_step": 27.29810070991516, "step": 30}
+{"train_info/time_between_train_steps": 0.006146907806396484, "step": 30}
+{"info/global_step": 31, "train_info/time_within_train_step": 27.550361394882202, "step": 31}
+{"train_info/time_between_train_steps": 0.005604743957519531, "step": 31}
+{"info/global_step": 32, "train_info/time_within_train_step": 27.293407440185547, "step": 32}
+{"train_info/time_between_train_steps": 0.005837440490722656, "step": 32}
+{"info/global_step": 33, "train_info/time_within_train_step": 27.47903323173523, "step": 33}
+{"train_info/time_between_train_steps": 0.005644559860229492, "step": 33}
+{"info/global_step": 34, "train_info/time_within_train_step": 27.267890691757202, "step": 34}
+{"train_info/time_between_train_steps": 0.0056705474853515625, "step": 34}
+{"info/global_step": 35, "train_info/time_within_train_step": 27.850106954574585, "step": 35}
+{"train_info/time_between_train_steps": 0.005257368087768555, "step": 35}
+{"info/global_step": 36, "train_info/time_within_train_step": 27.23758888244629, "step": 36}
+{"train_info/time_between_train_steps": 0.005394935607910156, "step": 36}
+{"info/global_step": 37, "train_info/time_within_train_step": 27.261752605438232, "step": 37}
+{"train_info/time_between_train_steps": 0.00528407096862793, "step": 37}
+{"info/global_step": 38, "train_info/time_within_train_step": 27.34846806526184, "step": 38}
+{"train_info/time_between_train_steps": 0.005433797836303711, "step": 38}
+{"info/global_step": 39, "train_info/time_within_train_step": 27.230044841766357, "step": 39}
+{"train_info/time_between_train_steps": 0.01087331771850586, "step": 39}
+{"info/global_step": 40, "train_info/time_within_train_step": 27.232797384262085, "step": 40}
+{"train_info/time_between_train_steps": 0.0054852962493896484, "step": 40}
+{"info/global_step": 41, "train_info/time_within_train_step": 27.234292030334473, "step": 41}
+{"train_info/time_between_train_steps": 0.005366802215576172, "step": 41}
+{"info/global_step": 42, "train_info/time_within_train_step": 27.22655487060547, "step": 42}
+{"train_info/time_between_train_steps": 0.005409717559814453, "step": 42}
+{"info/global_step": 43, "train_info/time_within_train_step": 27.222636699676514, "step": 43}
+{"train_info/time_between_train_steps": 0.005441188812255859, "step": 43}
+{"info/global_step": 44, "train_info/time_within_train_step": 27.423120498657227, "step": 44}
+{"train_info/time_between_train_steps": 0.009590625762939453, "step": 44}
+{"info/global_step": 45, "train_info/time_within_train_step": 27.22589135169983, "step": 45}
+{"train_info/time_between_train_steps": 0.005521297454833984, "step": 45}
+{"info/global_step": 46, "train_info/time_within_train_step": 27.22916865348816, "step": 46}
+{"train_info/time_between_train_steps": 0.010180234909057617, "step": 46}
+{"info/global_step": 47, "train_info/time_within_train_step": 27.285945415496826, "step": 47}
+{"train_info/time_between_train_steps": 0.005359172821044922, "step": 47}
+{"info/global_step": 48, "train_info/time_within_train_step": 27.240264415740967, "step": 48}
+{"train_info/time_between_train_steps": 0.005252838134765625, "step": 48}
+{"info/global_step": 49, "train_info/time_within_train_step": 27.21561336517334, "step": 49}
+{"train_info/time_between_train_steps": 0.005248546600341797, "step": 49}
+{"info/global_step": 50, "train_info/time_within_train_step": 27.222008228302002, "step": 50}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734010937, "_runtime": 1388}, "step": 50}
+{"logs": {"train/loss": 8.2551, "train/learning_rate": 0.00025, "train/epoch": 1.02, "_timestamp": 1734010937, "_runtime": 1388}, "step": 50}
+{"train_info/time_between_train_steps": 0.008126497268676758, "step": 50}
+{"info/global_step": 51, "train_info/time_within_train_step": 27.24115538597107, "step": 51}
+{"train_info/time_between_train_steps": 0.005240917205810547, "step": 51}
+{"info/global_step": 52, "train_info/time_within_train_step": 27.232333183288574, "step": 52}
+{"train_info/time_between_train_steps": 0.005731344223022461, "step": 52}
+{"info/global_step": 53, "train_info/time_within_train_step": 27.259469270706177, "step": 53}
+{"train_info/time_between_train_steps": 0.00571894645690918, "step": 53}
+{"info/global_step": 54, "train_info/time_within_train_step": 27.257779121398926, "step": 54}
+{"train_info/time_between_train_steps": 0.006444215774536133, "step": 54}
+{"train_info/time_between_train_steps": 13.086959600448608, "step": 54}
+{"info/global_step": 55, "train_info/time_within_train_step": 27.26534128189087, "step": 55}
+{"train_info/time_between_train_steps": 0.00545048713684082, "step": 55}
+{"info/global_step": 56, "train_info/time_within_train_step": 27.370739221572876, "step": 56}
+{"train_info/time_between_train_steps": 0.0053026676177978516, "step": 56}
+{"info/global_step": 57, "train_info/time_within_train_step": 27.21298599243164, "step": 57}
+{"train_info/time_between_train_steps": 0.005305290222167969, "step": 57}
+{"info/global_step": 58, "train_info/time_within_train_step": 27.33854055404663, "step": 58}
+{"train_info/time_between_train_steps": 0.005473136901855469, "step": 58}
+{"info/global_step": 59, "train_info/time_within_train_step": 27.216068744659424, "step": 59}
+{"train_info/time_between_train_steps": 0.00535893440246582, "step": 59}
+{"info/global_step": 60, "train_info/time_within_train_step": 27.312793016433716, "step": 60}
+{"train_info/time_between_train_steps": 0.0053141117095947266, "step": 60}
+{"info/global_step": 61, "train_info/time_within_train_step": 27.226682424545288, "step": 61}
+{"train_info/time_between_train_steps": 0.005443096160888672, "step": 61}
+{"info/global_step": 62, "train_info/time_within_train_step": 27.348260164260864, "step": 62}
+{"train_info/time_between_train_steps": 0.005154848098754883, "step": 62}
+{"info/global_step": 63, "train_info/time_within_train_step": 27.24720025062561, "step": 63}
+{"train_info/time_between_train_steps": 0.005398988723754883, "step": 63}
+{"info/global_step": 64, "train_info/time_within_train_step": 27.192039966583252, "step": 64}
+{"train_info/time_between_train_steps": 0.00513768196105957, "step": 64}
+{"info/global_step": 65, "train_info/time_within_train_step": 27.235431671142578, "step": 65}
+{"train_info/time_between_train_steps": 0.0052661895751953125, "step": 65}
+{"info/global_step": 66, "train_info/time_within_train_step": 27.227601289749146, "step": 66}
+{"train_info/time_between_train_steps": 0.0056896209716796875, "step": 66}
+{"info/global_step": 67, "train_info/time_within_train_step": 27.197728157043457, "step": 67}
+{"train_info/time_between_train_steps": 0.005370140075683594, "step": 67}
+{"info/global_step": 68, "train_info/time_within_train_step": 27.246623992919922, "step": 68}
+{"train_info/time_between_train_steps": 0.00548243522644043, "step": 68}
+{"info/global_step": 69, "train_info/time_within_train_step": 27.284056901931763, "step": 69}
+{"train_info/time_between_train_steps": 0.005324125289916992, "step": 69}
+{"info/global_step": 70, "train_info/time_within_train_step": 27.34241485595703, "step": 70}
+{"train_info/time_between_train_steps": 0.017647981643676758, "step": 70}
+{"info/global_step": 71, "train_info/time_within_train_step": 27.21190333366394, "step": 71}
+{"train_info/time_between_train_steps": 0.005394935607910156, "step": 71}
+{"info/global_step": 72, "train_info/time_within_train_step": 28.22646403312683, "step": 72}
+{"train_info/time_between_train_steps": 0.0053768157958984375, "step": 72}
+{"info/global_step": 73, "train_info/time_within_train_step": 27.224189043045044, "step": 73}
+{"train_info/time_between_train_steps": 0.012000799179077148, "step": 73}
+{"info/global_step": 74, "train_info/time_within_train_step": 28.631621599197388, "step": 74}
+{"train_info/time_between_train_steps": 0.005431413650512695, "step": 74}
+{"info/global_step": 75, "train_info/time_within_train_step": 27.30460786819458, "step": 75}
+{"train_info/time_between_train_steps": 0.005465269088745117, "step": 75}
+{"info/global_step": 76, "train_info/time_within_train_step": 27.352893829345703, "step": 76}
+{"train_info/time_between_train_steps": 0.013202428817749023, "step": 76}
+{"info/global_step": 77, "train_info/time_within_train_step": 27.35923957824707, "step": 77}
+{"train_info/time_between_train_steps": 0.009780168533325195, "step": 77}
+{"info/global_step": 78, "train_info/time_within_train_step": 27.453487873077393, "step": 78}
+{"train_info/time_between_train_steps": 0.0054514408111572266, "step": 78}
+{"info/global_step": 79, "train_info/time_within_train_step": 27.30955719947815, "step": 79}
+{"train_info/time_between_train_steps": 0.010524988174438477, "step": 79}
+{"info/global_step": 80, "train_info/time_within_train_step": 27.36312508583069, "step": 80}
+{"train_info/time_between_train_steps": 0.005414247512817383, "step": 80}
+{"info/global_step": 81, "train_info/time_within_train_step": 27.304611444473267, "step": 81}
+{"train_info/time_between_train_steps": 0.0062084197998046875, "step": 81}
+{"train_info/time_between_train_steps": 13.204168319702148, "step": 81}
+{"info/global_step": 82, "train_info/time_within_train_step": 27.248695850372314, "step": 82}
+{"train_info/time_between_train_steps": 0.0050585269927978516, "step": 82}
+{"info/global_step": 83, "train_info/time_within_train_step": 27.358758211135864, "step": 83}
+{"train_info/time_between_train_steps": 0.005247354507446289, "step": 83}
+{"info/global_step": 84, "train_info/time_within_train_step": 27.31750202178955, "step": 84}
+{"train_info/time_between_train_steps": 0.005296945571899414, "step": 84}
+{"info/global_step": 85, "train_info/time_within_train_step": 27.445859670639038, "step": 85}
+{"train_info/time_between_train_steps": 0.005919694900512695, "step": 85}
+{"info/global_step": 86, "train_info/time_within_train_step": 27.35288429260254, "step": 86}
+{"train_info/time_between_train_steps": 0.010787248611450195, "step": 86}
+{"info/global_step": 87, "train_info/time_within_train_step": 27.603801250457764, "step": 87}
+{"train_info/time_between_train_steps": 0.0056915283203125, "step": 87}
+{"info/global_step": 88, "train_info/time_within_train_step": 27.397591590881348, "step": 88}
+{"train_info/time_between_train_steps": 0.006037235260009766, "step": 88}
+{"info/global_step": 89, "train_info/time_within_train_step": 27.465806245803833, "step": 89}
+{"train_info/time_between_train_steps": 0.012233495712280273, "step": 89}
+{"info/global_step": 90, "train_info/time_within_train_step": 27.347827434539795, "step": 90}
+{"train_info/time_between_train_steps": 0.00532984733581543, "step": 90}
+{"info/global_step": 91, "train_info/time_within_train_step": 27.22930407524109, "step": 91}
+{"train_info/time_between_train_steps": 0.005345821380615234, "step": 91}
+{"info/global_step": 92, "train_info/time_within_train_step": 27.32673740386963, "step": 92}
+{"train_info/time_between_train_steps": 0.0054514408111572266, "step": 92}
+{"info/global_step": 93, "train_info/time_within_train_step": 28.109936952590942, "step": 93}
+{"train_info/time_between_train_steps": 0.005641460418701172, "step": 93}
+{"info/global_step": 94, "train_info/time_within_train_step": 27.28028631210327, "step": 94}
+{"train_info/time_between_train_steps": 0.007003068923950195, "step": 94}
+{"info/global_step": 95, "train_info/time_within_train_step": 27.442970752716064, "step": 95}
+{"train_info/time_between_train_steps": 0.008707284927368164, "step": 95}
+{"info/global_step": 96, "train_info/time_within_train_step": 27.692561626434326, "step": 96}
+{"train_info/time_between_train_steps": 0.005312204360961914, "step": 96}
+{"info/global_step": 97, "train_info/time_within_train_step": 27.586622953414917, "step": 97}
+{"train_info/time_between_train_steps": 0.007033586502075195, "step": 97}
+{"info/global_step": 98, "train_info/time_within_train_step": 27.236198663711548, "step": 98}
+{"train_info/time_between_train_steps": 0.005651950836181641, "step": 98}
+{"info/global_step": 99, "train_info/time_within_train_step": 27.377466678619385, "step": 99}
+{"train_info/time_between_train_steps": 0.005448102951049805, "step": 99}
+{"info/global_step": 100, "train_info/time_within_train_step": 27.276402473449707, "step": 100}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734012333, "_runtime": 2784}, "step": 100}
+{"logs": {"train/loss": 6.5164, "train/learning_rate": 0.0005, "train/epoch": 3.02, "_timestamp": 1734012333, "_runtime": 2784}, "step": 100}
+{"train_info/time_between_train_steps": 2.4337470531463623, "step": 100}
+{"info/global_step": 101, "train_info/time_within_train_step": 27.245219707489014, "step": 101}
+{"train_info/time_between_train_steps": 0.010434389114379883, "step": 101}
+{"info/global_step": 102, "train_info/time_within_train_step": 27.20097780227661, "step": 102}
+{"train_info/time_between_train_steps": 0.005671262741088867, "step": 102}
+{"info/global_step": 103, "train_info/time_within_train_step": 27.319756031036377, "step": 103}
+{"train_info/time_between_train_steps": 0.010403156280517578, "step": 103}
+{"info/global_step": 104, "train_info/time_within_train_step": 27.310544967651367, "step": 104}
+{"train_info/time_between_train_steps": 0.005731105804443359, "step": 104}
+{"info/global_step": 105, "train_info/time_within_train_step": 27.428560972213745, "step": 105}
+{"train_info/time_between_train_steps": 0.012356281280517578, "step": 105}
+{"info/global_step": 106, "train_info/time_within_train_step": 27.457520246505737, "step": 106}
+{"train_info/time_between_train_steps": 0.007974386215209961, "step": 106}
+{"info/global_step": 107, "train_info/time_within_train_step": 27.4455988407135, "step": 107}
+{"train_info/time_between_train_steps": 0.007240772247314453, "step": 107}
+{"info/global_step": 108, "train_info/time_within_train_step": 27.558592557907104, "step": 108}
+{"train_info/time_between_train_steps": 0.007956266403198242, "step": 108}
+{"train_info/time_between_train_steps": 13.330041408538818, "step": 108}
+{"info/global_step": 109, "train_info/time_within_train_step": 27.380130767822266, "step": 109}
+{"train_info/time_between_train_steps": 0.012279987335205078, "step": 109}
+{"info/global_step": 110, "train_info/time_within_train_step": 27.545589208602905, "step": 110}
+{"train_info/time_between_train_steps": 0.005964756011962891, "step": 110}
+{"info/global_step": 111, "train_info/time_within_train_step": 27.3411967754364, "step": 111}
+{"train_info/time_between_train_steps": 0.00608372688293457, "step": 111}
+{"info/global_step": 112, "train_info/time_within_train_step": 27.51325273513794, "step": 112}
+{"train_info/time_between_train_steps": 0.016414165496826172, "step": 112}
+{"info/global_step": 113, "train_info/time_within_train_step": 27.366541862487793, "step": 113}
+{"train_info/time_between_train_steps": 0.011642217636108398, "step": 113}
+{"info/global_step": 114, "train_info/time_within_train_step": 27.624361276626587, "step": 114}
+{"train_info/time_between_train_steps": 0.005833625793457031, "step": 114}
+{"info/global_step": 115, "train_info/time_within_train_step": 27.57137131690979, "step": 115}
+{"train_info/time_between_train_steps": 0.006321430206298828, "step": 115}
+{"info/global_step": 116, "train_info/time_within_train_step": 27.360617876052856, "step": 116}
+{"train_info/time_between_train_steps": 0.00555109977722168, "step": 116}
+{"info/global_step": 117, "train_info/time_within_train_step": 27.355725288391113, "step": 117}
+{"train_info/time_between_train_steps": 0.005721569061279297, "step": 117}
+{"info/global_step": 118, "train_info/time_within_train_step": 27.24662494659424, "step": 118}
+{"train_info/time_between_train_steps": 0.009737491607666016, "step": 118}
+{"info/global_step": 119, "train_info/time_within_train_step": 27.43115472793579, "step": 119}
+{"train_info/time_between_train_steps": 0.005687236785888672, "step": 119}
+{"info/global_step": 120, "train_info/time_within_train_step": 27.423017501831055, "step": 120}
+{"train_info/time_between_train_steps": 0.00884866714477539, "step": 120}
+{"info/global_step": 121, "train_info/time_within_train_step": 27.395385265350342, "step": 121}
+{"train_info/time_between_train_steps": 0.005854129791259766, "step": 121}
+{"info/global_step": 122, "train_info/time_within_train_step": 27.418846607208252, "step": 122}
+{"train_info/time_between_train_steps": 0.009710073471069336, "step": 122}
+{"info/global_step": 123, "train_info/time_within_train_step": 27.285964488983154, "step": 123}
+{"train_info/time_between_train_steps": 0.005699872970581055, "step": 123}
+{"info/global_step": 124, "train_info/time_within_train_step": 27.37974524497986, "step": 124}
+{"train_info/time_between_train_steps": 0.005486488342285156, "step": 124}
+{"info/global_step": 125, "train_info/time_within_train_step": 27.293459177017212, "step": 125}
+{"train_info/time_between_train_steps": 0.0056438446044921875, "step": 125}
+{"info/global_step": 126, "train_info/time_within_train_step": 27.383872509002686, "step": 126}
+{"train_info/time_between_train_steps": 0.005425930023193359, "step": 126}
+{"info/global_step": 127, "train_info/time_within_train_step": 27.36147928237915, "step": 127}
+{"train_info/time_between_train_steps": 0.0055925846099853516, "step": 127}
+{"info/global_step": 128, "train_info/time_within_train_step": 27.28618025779724, "step": 128}
+{"train_info/time_between_train_steps": 0.005793094635009766, "step": 128}
+{"info/global_step": 129, "train_info/time_within_train_step": 27.332168102264404, "step": 129}
+{"train_info/time_between_train_steps": 0.011013269424438477, "step": 129}
+{"info/global_step": 130, "train_info/time_within_train_step": 27.396401405334473, "step": 130}
+{"train_info/time_between_train_steps": 0.014610767364501953, "step": 130}
+{"info/global_step": 131, "train_info/time_within_train_step": 27.39585280418396, "step": 131}
+{"train_info/time_between_train_steps": 0.0056188106536865234, "step": 131}
+{"info/global_step": 132, "train_info/time_within_train_step": 27.411385536193848, "step": 132}
+{"train_info/time_between_train_steps": 0.0057370662689208984, "step": 132}
+{"info/global_step": 133, "train_info/time_within_train_step": 27.3404061794281, "step": 133}
+{"train_info/time_between_train_steps": 0.01804828643798828, "step": 133}
+{"info/global_step": 134, "train_info/time_within_train_step": 27.431970596313477, "step": 134}
+{"train_info/time_between_train_steps": 0.005845308303833008, "step": 134}
+{"info/global_step": 135, "train_info/time_within_train_step": 27.396618843078613, "step": 135}
+{"train_info/time_between_train_steps": 0.00644230842590332, "step": 135}
+{"train_info/time_between_train_steps": 13.170060157775879, "step": 135}
+{"info/global_step": 136, "train_info/time_within_train_step": 27.39639163017273, "step": 136}
+{"train_info/time_between_train_steps": 0.0206758975982666, "step": 136}
+{"info/global_step": 137, "train_info/time_within_train_step": 27.59170699119568, "step": 137}
+{"train_info/time_between_train_steps": 0.010412931442260742, "step": 137}
+{"info/global_step": 138, "train_info/time_within_train_step": 27.459978103637695, "step": 138}
+{"train_info/time_between_train_steps": 0.00572967529296875, "step": 138}
+{"info/global_step": 139, "train_info/time_within_train_step": 27.67072343826294, "step": 139}
+{"train_info/time_between_train_steps": 0.009820699691772461, "step": 139}
+{"info/global_step": 140, "train_info/time_within_train_step": 27.368707418441772, "step": 140}
+{"train_info/time_between_train_steps": 0.005986690521240234, "step": 140}
+{"info/global_step": 141, "train_info/time_within_train_step": 27.568739652633667, "step": 141}
+{"train_info/time_between_train_steps": 0.005504608154296875, "step": 141}
+{"info/global_step": 142, "train_info/time_within_train_step": 27.41534972190857, "step": 142}
+{"train_info/time_between_train_steps": 0.0061147212982177734, "step": 142}
+{"info/global_step": 143, "train_info/time_within_train_step": 27.388182163238525, "step": 143}
+{"train_info/time_between_train_steps": 0.005227327346801758, "step": 143}
+{"info/global_step": 144, "train_info/time_within_train_step": 27.333370208740234, "step": 144}
+{"train_info/time_between_train_steps": 0.0093536376953125, "step": 144}
+{"info/global_step": 145, "train_info/time_within_train_step": 27.37626576423645, "step": 145}
+{"train_info/time_between_train_steps": 0.0055620670318603516, "step": 145}
+{"info/global_step": 146, "train_info/time_within_train_step": 27.37166166305542, "step": 146}
+{"train_info/time_between_train_steps": 0.010332107543945312, "step": 146}
+{"info/global_step": 147, "train_info/time_within_train_step": 27.356934309005737, "step": 147}
+{"train_info/time_between_train_steps": 0.005283355712890625, "step": 147}
+{"info/global_step": 148, "train_info/time_within_train_step": 27.292845726013184, "step": 148}
+{"train_info/time_between_train_steps": 0.005858898162841797, "step": 148}
+{"info/global_step": 149, "train_info/time_within_train_step": 27.324885845184326, "step": 149}
+{"train_info/time_between_train_steps": 0.005596637725830078, "step": 149}
+{"info/global_step": 150, "train_info/time_within_train_step": 27.326303482055664, "step": 150}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734013732, "_runtime": 4183}, "step": 150}
+{"logs": {"train/loss": 5.8905, "train/learning_rate": 0.0005833333333333333, "train/epoch": 5.01, "_timestamp": 1734013732, "_runtime": 4183}, "step": 150}
+{"train_info/time_between_train_steps": 0.00818634033203125, "step": 150}
+{"info/global_step": 151, "train_info/time_within_train_step": 27.416576623916626, "step": 151}
+{"train_info/time_between_train_steps": 0.005750894546508789, "step": 151}
+{"info/global_step": 152, "train_info/time_within_train_step": 27.378103256225586, "step": 152}
+{"train_info/time_between_train_steps": 0.010059356689453125, "step": 152}
+{"info/global_step": 153, "train_info/time_within_train_step": 27.339385271072388, "step": 153}
+{"train_info/time_between_train_steps": 0.005666255950927734, "step": 153}
+{"info/global_step": 154, "train_info/time_within_train_step": 27.283268451690674, "step": 154}
+{"train_info/time_between_train_steps": 0.010790109634399414, "step": 154}
+{"info/global_step": 155, "train_info/time_within_train_step": 27.376376628875732, "step": 155}
+{"train_info/time_between_train_steps": 0.006280660629272461, "step": 155}
+{"info/global_step": 156, "train_info/time_within_train_step": 27.351243019104004, "step": 156}
+{"train_info/time_between_train_steps": 0.009772300720214844, "step": 156}
+{"info/global_step": 157, "train_info/time_within_train_step": 27.368546962738037, "step": 157}
+{"train_info/time_between_train_steps": 0.006025552749633789, "step": 157}
+{"info/global_step": 158, "train_info/time_within_train_step": 27.45238995552063, "step": 158}
+{"train_info/time_between_train_steps": 0.010257244110107422, "step": 158}
+{"info/global_step": 159, "train_info/time_within_train_step": 27.408488750457764, "step": 159}
+{"train_info/time_between_train_steps": 0.011748790740966797, "step": 159}
+{"info/global_step": 160, "train_info/time_within_train_step": 27.437894821166992, "step": 160}
+{"train_info/time_between_train_steps": 0.005690097808837891, "step": 160}
+{"info/global_step": 161, "train_info/time_within_train_step": 27.38964533805847, "step": 161}
+{"train_info/time_between_train_steps": 0.005949258804321289, "step": 161}
+{"info/global_step": 162, "train_info/time_within_train_step": 27.528400421142578, "step": 162}
+{"train_info/time_between_train_steps": 0.0063571929931640625, "step": 162}
+{"train_info/time_between_train_steps": 13.216240882873535, "step": 162}
+{"info/global_step": 163, "train_info/time_within_train_step": 27.38889217376709, "step": 163}
+{"train_info/time_between_train_steps": 0.005834817886352539, "step": 163}
+{"info/global_step": 164, "train_info/time_within_train_step": 27.703904151916504, "step": 164}
+{"train_info/time_between_train_steps": 0.005795955657958984, "step": 164}
+{"info/global_step": 165, "train_info/time_within_train_step": 27.38463020324707, "step": 165}
+{"train_info/time_between_train_steps": 0.006268501281738281, "step": 165}
+{"info/global_step": 166, "train_info/time_within_train_step": 27.697975397109985, "step": 166}
+{"train_info/time_between_train_steps": 0.006405353546142578, "step": 166}
+{"info/global_step": 167, "train_info/time_within_train_step": 27.322188138961792, "step": 167}
+{"train_info/time_between_train_steps": 0.0062482357025146484, "step": 167}
+{"info/global_step": 168, "train_info/time_within_train_step": 27.54121208190918, "step": 168}
+{"train_info/time_between_train_steps": 0.006440401077270508, "step": 168}
+{"info/global_step": 169, "train_info/time_within_train_step": 27.3047776222229, "step": 169}
+{"train_info/time_between_train_steps": 0.005949258804321289, "step": 169}
+{"info/global_step": 170, "train_info/time_within_train_step": 27.472229719161987, "step": 170}
+{"train_info/time_between_train_steps": 0.005624055862426758, "step": 170}
+{"info/global_step": 171, "train_info/time_within_train_step": 27.27858281135559, "step": 171}
+{"train_info/time_between_train_steps": 0.005552530288696289, "step": 171}
+{"info/global_step": 172, "train_info/time_within_train_step": 27.26422619819641, "step": 172}
+{"train_info/time_between_train_steps": 0.005445241928100586, "step": 172}
+{"info/global_step": 173, "train_info/time_within_train_step": 27.441617488861084, "step": 173}
+{"train_info/time_between_train_steps": 0.005557060241699219, "step": 173}
+{"info/global_step": 174, "train_info/time_within_train_step": 27.50800585746765, "step": 174}
+{"train_info/time_between_train_steps": 0.005720853805541992, "step": 174}
+{"info/global_step": 175, "train_info/time_within_train_step": 27.549427270889282, "step": 175}
+{"train_info/time_between_train_steps": 0.0060503482818603516, "step": 175}
+{"info/global_step": 176, "train_info/time_within_train_step": 27.46770215034485, "step": 176}
+{"train_info/time_between_train_steps": 0.010228395462036133, "step": 176}
+{"info/global_step": 177, "train_info/time_within_train_step": 27.475542783737183, "step": 177}
+{"train_info/time_between_train_steps": 0.0059125423431396484, "step": 177}
+{"info/global_step": 178, "train_info/time_within_train_step": 27.363085508346558, "step": 178}
+{"train_info/time_between_train_steps": 0.007063388824462891, "step": 178}
+{"info/global_step": 179, "train_info/time_within_train_step": 27.40043807029724, "step": 179}
+{"train_info/time_between_train_steps": 0.006829261779785156, "step": 179}
+{"info/global_step": 180, "train_info/time_within_train_step": 27.404611825942993, "step": 180}
+{"train_info/time_between_train_steps": 0.006841182708740234, "step": 180}
+{"info/global_step": 181, "train_info/time_within_train_step": 27.369839191436768, "step": 181}
+{"train_info/time_between_train_steps": 0.005694150924682617, "step": 181}
+{"info/global_step": 182, "train_info/time_within_train_step": 27.34023380279541, "step": 182}
+{"train_info/time_between_train_steps": 0.005562543869018555, "step": 182}
+{"info/global_step": 183, "train_info/time_within_train_step": 27.428359746932983, "step": 183}
+{"train_info/time_between_train_steps": 0.00671696662902832, "step": 183}
+{"info/global_step": 184, "train_info/time_within_train_step": 27.47719120979309, "step": 184}
+{"train_info/time_between_train_steps": 0.006529569625854492, "step": 184}
+{"info/global_step": 185, "train_info/time_within_train_step": 27.424920558929443, "step": 185}
+{"train_info/time_between_train_steps": 0.006943464279174805, "step": 185}
+{"info/global_step": 186, "train_info/time_within_train_step": 27.621877908706665, "step": 186}
+{"train_info/time_between_train_steps": 0.006478309631347656, "step": 186}
+{"info/global_step": 187, "train_info/time_within_train_step": 27.346976280212402, "step": 187}
+{"train_info/time_between_train_steps": 0.006501436233520508, "step": 187}
+{"info/global_step": 188, "train_info/time_within_train_step": 27.45473885536194, "step": 188}
+{"train_info/time_between_train_steps": 0.011291742324829102, "step": 188}
+{"info/global_step": 189, "train_info/time_within_train_step": 27.598445653915405, "step": 189}
+{"train_info/time_between_train_steps": 0.012840986251831055, "step": 189}
+{"train_info/time_between_train_steps": 13.185821056365967, "step": 189}
+{"info/global_step": 190, "train_info/time_within_train_step": 27.375753164291382, "step": 190}
+{"train_info/time_between_train_steps": 0.010725975036621094, "step": 190}
+{"info/global_step": 191, "train_info/time_within_train_step": 27.6009783744812, "step": 191}
+{"train_info/time_between_train_steps": 0.006216526031494141, "step": 191}
+{"info/global_step": 192, "train_info/time_within_train_step": 27.405402660369873, "step": 192}
+{"train_info/time_between_train_steps": 0.012586832046508789, "step": 192}
+{"info/global_step": 193, "train_info/time_within_train_step": 27.606621503829956, "step": 193}
+{"train_info/time_between_train_steps": 0.007149934768676758, "step": 193}
+{"info/global_step": 194, "train_info/time_within_train_step": 27.48148798942566, "step": 194}
+{"train_info/time_between_train_steps": 0.006000995635986328, "step": 194}
+{"info/global_step": 195, "train_info/time_within_train_step": 27.536478757858276, "step": 195}
+{"train_info/time_between_train_steps": 0.011017799377441406, "step": 195}
+{"info/global_step": 196, "train_info/time_within_train_step": 27.510805130004883, "step": 196}
+{"train_info/time_between_train_steps": 0.0059185028076171875, "step": 196}
+{"info/global_step": 197, "train_info/time_within_train_step": 27.447265148162842, "step": 197}
+{"train_info/time_between_train_steps": 0.005402565002441406, "step": 197}
+{"info/global_step": 198, "train_info/time_within_train_step": 27.279973030090332, "step": 198}
+{"train_info/time_between_train_steps": 0.0056569576263427734, "step": 198}
+{"info/global_step": 199, "train_info/time_within_train_step": 27.45346999168396, "step": 199}
+{"train_info/time_between_train_steps": 0.009605646133422852, "step": 199}
+{"info/global_step": 200, "train_info/time_within_train_step": 27.440927028656006, "step": 200}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734015131, "_runtime": 5582}, "step": 200}
+{"logs": {"train/loss": 5.5027, "train/learning_rate": 0.0005555555555555556, "train/epoch": 7.01, "_timestamp": 1734015131, "_runtime": 5582}, "step": 200}
+{"train_info/time_between_train_steps": 2.568361282348633, "step": 200}
+{"info/global_step": 201, "train_info/time_within_train_step": 27.545247077941895, "step": 201}
+{"train_info/time_between_train_steps": 0.005521297454833984, "step": 201}
+{"info/global_step": 202, "train_info/time_within_train_step": 27.39992046356201, "step": 202}
+{"train_info/time_between_train_steps": 0.005584716796875, "step": 202}
+{"info/global_step": 203, "train_info/time_within_train_step": 27.39264678955078, "step": 203}
+{"train_info/time_between_train_steps": 0.008813858032226562, "step": 203}
+{"info/global_step": 204, "train_info/time_within_train_step": 27.362890243530273, "step": 204}
+{"train_info/time_between_train_steps": 0.010347127914428711, "step": 204}
+{"info/global_step": 205, "train_info/time_within_train_step": 27.441339254379272, "step": 205}
+{"train_info/time_between_train_steps": 0.0056073665618896484, "step": 205}
+{"info/global_step": 206, "train_info/time_within_train_step": 27.33486795425415, "step": 206}
+{"train_info/time_between_train_steps": 0.010640859603881836, "step": 206}
+{"info/global_step": 207, "train_info/time_within_train_step": 27.35040259361267, "step": 207}
+{"train_info/time_between_train_steps": 0.009897708892822266, "step": 207}
+{"info/global_step": 208, "train_info/time_within_train_step": 27.487857818603516, "step": 208}
+{"train_info/time_between_train_steps": 0.015609025955200195, "step": 208}
+{"info/global_step": 209, "train_info/time_within_train_step": 27.41507601737976, "step": 209}
+{"train_info/time_between_train_steps": 0.01051783561706543, "step": 209}
+{"info/global_step": 210, "train_info/time_within_train_step": 27.346322059631348, "step": 210}
+{"train_info/time_between_train_steps": 0.0057222843170166016, "step": 210}
+{"info/global_step": 211, "train_info/time_within_train_step": 27.354559898376465, "step": 211}
+{"train_info/time_between_train_steps": 0.010547161102294922, "step": 211}
+{"info/global_step": 212, "train_info/time_within_train_step": 27.425975561141968, "step": 212}
+{"train_info/time_between_train_steps": 0.005865573883056641, "step": 212}
+{"info/global_step": 213, "train_info/time_within_train_step": 27.238673210144043, "step": 213}
+{"train_info/time_between_train_steps": 0.0057981014251708984, "step": 213}
+{"info/global_step": 214, "train_info/time_within_train_step": 27.27722430229187, "step": 214}
+{"train_info/time_between_train_steps": 0.005895853042602539, "step": 214}
+{"info/global_step": 215, "train_info/time_within_train_step": 27.37804365158081, "step": 215}
+{"train_info/time_between_train_steps": 0.0059015750885009766, "step": 215}
+{"info/global_step": 216, "train_info/time_within_train_step": 27.367876768112183, "step": 216}
+{"train_info/time_between_train_steps": 0.006406307220458984, "step": 216}
+{"train_info/time_between_train_steps": 13.331374168395996, "step": 216}
+{"info/global_step": 217, "train_info/time_within_train_step": 27.242469310760498, "step": 217}
+{"train_info/time_between_train_steps": 0.0059642791748046875, "step": 217}
+{"info/global_step": 218, "train_info/time_within_train_step": 27.541590690612793, "step": 218}
+{"train_info/time_between_train_steps": 0.006680727005004883, "step": 218}
+{"info/global_step": 219, "train_info/time_within_train_step": 27.496817588806152, "step": 219}
+{"train_info/time_between_train_steps": 0.005746126174926758, "step": 219}
+{"info/global_step": 220, "train_info/time_within_train_step": 27.524900436401367, "step": 220}
+{"train_info/time_between_train_steps": 0.007933855056762695, "step": 220}
+{"info/global_step": 221, "train_info/time_within_train_step": 27.323824167251587, "step": 221}
+{"train_info/time_between_train_steps": 0.005676746368408203, "step": 221}
+{"info/global_step": 222, "train_info/time_within_train_step": 27.633001804351807, "step": 222}
+{"train_info/time_between_train_steps": 0.014851570129394531, "step": 222}
+{"info/global_step": 223, "train_info/time_within_train_step": 27.345974445343018, "step": 223}
+{"train_info/time_between_train_steps": 0.005997419357299805, "step": 223}
+{"info/global_step": 224, "train_info/time_within_train_step": 27.41454839706421, "step": 224}
+{"train_info/time_between_train_steps": 0.010034561157226562, "step": 224}
+{"info/global_step": 225, "train_info/time_within_train_step": 27.462934732437134, "step": 225}
+{"train_info/time_between_train_steps": 0.009026527404785156, "step": 225}
+{"info/global_step": 226, "train_info/time_within_train_step": 27.252289295196533, "step": 226}
+{"train_info/time_between_train_steps": 0.005316972732543945, "step": 226}
+{"info/global_step": 227, "train_info/time_within_train_step": 27.38498568534851, "step": 227}
+{"train_info/time_between_train_steps": 0.010526895523071289, "step": 227}
+{"info/global_step": 228, "train_info/time_within_train_step": 27.41074800491333, "step": 228}
+{"train_info/time_between_train_steps": 0.012090682983398438, "step": 228}
+{"info/global_step": 229, "train_info/time_within_train_step": 27.42793297767639, "step": 229}
+{"train_info/time_between_train_steps": 0.01070261001586914, "step": 229}
+{"info/global_step": 230, "train_info/time_within_train_step": 27.515073776245117, "step": 230}
+{"train_info/time_between_train_steps": 0.009697198867797852, "step": 230}
+{"info/global_step": 231, "train_info/time_within_train_step": 27.33136248588562, "step": 231}
+{"train_info/time_between_train_steps": 0.0054454803466796875, "step": 231}
+{"info/global_step": 232, "train_info/time_within_train_step": 27.59547758102417, "step": 232}
+{"train_info/time_between_train_steps": 0.0057790279388427734, "step": 232}
+{"info/global_step": 233, "train_info/time_within_train_step": 27.415841341018677, "step": 233}
+{"train_info/time_between_train_steps": 0.0055315494537353516, "step": 233}
+{"info/global_step": 234, "train_info/time_within_train_step": 27.403275728225708, "step": 234}
+{"train_info/time_between_train_steps": 0.010174989700317383, "step": 234}
+{"info/global_step": 235, "train_info/time_within_train_step": 27.487892150878906, "step": 235}
+{"train_info/time_between_train_steps": 0.005608797073364258, "step": 235}
+{"info/global_step": 236, "train_info/time_within_train_step": 27.461527347564697, "step": 236}
+{"train_info/time_between_train_steps": 0.006446123123168945, "step": 236}
+{"info/global_step": 237, "train_info/time_within_train_step": 27.461671352386475, "step": 237}
+{"train_info/time_between_train_steps": 0.01064753532409668, "step": 237}
+{"info/global_step": 238, "train_info/time_within_train_step": 27.464078187942505, "step": 238}
+{"train_info/time_between_train_steps": 0.00556492805480957, "step": 238}
+{"info/global_step": 239, "train_info/time_within_train_step": 27.472524881362915, "step": 239}
+{"train_info/time_between_train_steps": 0.005541324615478516, "step": 239}
+{"info/global_step": 240, "train_info/time_within_train_step": 27.41245698928833, "step": 240}
+{"train_info/time_between_train_steps": 0.005814313888549805, "step": 240}
+{"info/global_step": 241, "train_info/time_within_train_step": 27.467800855636597, "step": 241}
+{"train_info/time_between_train_steps": 0.010816574096679688, "step": 241}
+{"info/global_step": 242, "train_info/time_within_train_step": 27.639688730239868, "step": 242}
+{"train_info/time_between_train_steps": 0.008822441101074219, "step": 242}
+{"info/global_step": 243, "train_info/time_within_train_step": 27.50320315361023, "step": 243}
+{"train_info/time_between_train_steps": 0.00672602653503418, "step": 243}
+{"train_info/time_between_train_steps": 14.005026817321777, "step": 243}
+{"info/global_step": 244, "train_info/time_within_train_step": 27.377277374267578, "step": 244}
+{"train_info/time_between_train_steps": 0.005919933319091797, "step": 244}
+{"info/global_step": 245, "train_info/time_within_train_step": 27.518120765686035, "step": 245}
+{"train_info/time_between_train_steps": 0.0060253143310546875, "step": 245}
+{"info/global_step": 246, "train_info/time_within_train_step": 27.4103102684021, "step": 246}
+{"train_info/time_between_train_steps": 0.01123189926147461, "step": 246}
+{"info/global_step": 247, "train_info/time_within_train_step": 27.987909078598022, "step": 247}
+{"train_info/time_between_train_steps": 0.005808115005493164, "step": 247}
+{"info/global_step": 248, "train_info/time_within_train_step": 27.466455459594727, "step": 248}
+{"train_info/time_between_train_steps": 0.005928993225097656, "step": 248}
+{"info/global_step": 249, "train_info/time_within_train_step": 27.526191234588623, "step": 249}
+{"train_info/time_between_train_steps": 0.015189647674560547, "step": 249}
+{"info/global_step": 250, "train_info/time_within_train_step": 27.574074268341064, "step": 250}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734016533, "_runtime": 6984}, "step": 250}
+{"logs": {"train/loss": 5.2583, "train/learning_rate": 0.0005277777777777777, "train/epoch": 9.01, "_timestamp": 1734016533, "_runtime": 6984}, "step": 250}
+{"train_info/time_between_train_steps": 0.014083147048950195, "step": 250}
+{"info/global_step": 251, "train_info/time_within_train_step": 27.572014570236206, "step": 251}
+{"train_info/time_between_train_steps": 0.0055179595947265625, "step": 251}
+{"info/global_step": 252, "train_info/time_within_train_step": 27.417848110198975, "step": 252}
+{"train_info/time_between_train_steps": 0.005564451217651367, "step": 252}
+{"info/global_step": 253, "train_info/time_within_train_step": 27.353350400924683, "step": 253}
+{"train_info/time_between_train_steps": 0.005587100982666016, "step": 253}
+{"info/global_step": 254, "train_info/time_within_train_step": 27.2756564617157, "step": 254}
+{"train_info/time_between_train_steps": 0.0068950653076171875, "step": 254}
+{"info/global_step": 255, "train_info/time_within_train_step": 27.405113458633423, "step": 255}
+{"train_info/time_between_train_steps": 0.010515928268432617, "step": 255}
+{"info/global_step": 256, "train_info/time_within_train_step": 27.563188076019287, "step": 256}
+{"train_info/time_between_train_steps": 0.009900569915771484, "step": 256}
+{"info/global_step": 257, "train_info/time_within_train_step": 27.42639660835266, "step": 257}
+{"train_info/time_between_train_steps": 0.0055844783782958984, "step": 257}
+{"info/global_step": 258, "train_info/time_within_train_step": 27.343542337417603, "step": 258}
+{"train_info/time_between_train_steps": 0.005670309066772461, "step": 258}
+{"info/global_step": 259, "train_info/time_within_train_step": 27.52509641647339, "step": 259}
+{"train_info/time_between_train_steps": 0.005585193634033203, "step": 259}
+{"info/global_step": 260, "train_info/time_within_train_step": 27.42197036743164, "step": 260}
+{"train_info/time_between_train_steps": 0.012555360794067383, "step": 260}
+{"info/global_step": 261, "train_info/time_within_train_step": 27.32544493675232, "step": 261}
+{"train_info/time_between_train_steps": 0.00947260856628418, "step": 261}
+{"info/global_step": 262, "train_info/time_within_train_step": 27.339035034179688, "step": 262}
+{"train_info/time_between_train_steps": 0.005416154861450195, "step": 262}
+{"info/global_step": 263, "train_info/time_within_train_step": 27.573099374771118, "step": 263}
+{"train_info/time_between_train_steps": 0.011263132095336914, "step": 263}
+{"info/global_step": 264, "train_info/time_within_train_step": 27.479703903198242, "step": 264}
+{"train_info/time_between_train_steps": 0.0054666996002197266, "step": 264}
+{"info/global_step": 265, "train_info/time_within_train_step": 27.31727647781372, "step": 265}
+{"train_info/time_between_train_steps": 0.011972665786743164, "step": 265}
+{"info/global_step": 266, "train_info/time_within_train_step": 27.419361114501953, "step": 266}
+{"train_info/time_between_train_steps": 0.010699272155761719, "step": 266}
+{"info/global_step": 267, "train_info/time_within_train_step": 27.299298763275146, "step": 267}
+{"train_info/time_between_train_steps": 0.011120796203613281, "step": 267}
+{"info/global_step": 268, "train_info/time_within_train_step": 27.385231971740723, "step": 268}
+{"train_info/time_between_train_steps": 0.010875701904296875, "step": 268}
+{"info/global_step": 269, "train_info/time_within_train_step": 27.387897968292236, "step": 269}
+{"train_info/time_between_train_steps": 0.006551980972290039, "step": 269}
+{"info/global_step": 270, "train_info/time_within_train_step": 27.403189182281494, "step": 270}
+{"train_info/time_between_train_steps": 0.014547348022460938, "step": 270}
+{"train_info/time_between_train_steps": 12.942928075790405, "step": 270}
+{"info/global_step": 271, "train_info/time_within_train_step": 27.31177043914795, "step": 271}
+{"train_info/time_between_train_steps": 0.005806446075439453, "step": 271}
+{"info/global_step": 272, "train_info/time_within_train_step": 27.486700534820557, "step": 272}
+{"train_info/time_between_train_steps": 0.007085084915161133, "step": 272}
+{"info/global_step": 273, "train_info/time_within_train_step": 27.311779022216797, "step": 273}
+{"train_info/time_between_train_steps": 0.005820274353027344, "step": 273}
+{"info/global_step": 274, "train_info/time_within_train_step": 27.49738383293152, "step": 274}
+{"train_info/time_between_train_steps": 0.010986804962158203, "step": 274}
+{"info/global_step": 275, "train_info/time_within_train_step": 27.40323305130005, "step": 275}
+{"train_info/time_between_train_steps": 0.005768537521362305, "step": 275}
+{"info/global_step": 276, "train_info/time_within_train_step": 27.487024068832397, "step": 276}
+{"train_info/time_between_train_steps": 0.005671262741088867, "step": 276}
+{"info/global_step": 277, "train_info/time_within_train_step": 27.227267026901245, "step": 277}
+{"train_info/time_between_train_steps": 0.005816459655761719, "step": 277}
+{"info/global_step": 278, "train_info/time_within_train_step": 27.52314329147339, "step": 278}
+{"train_info/time_between_train_steps": 0.005135297775268555, "step": 278}
+{"info/global_step": 279, "train_info/time_within_train_step": 27.27011513710022, "step": 279}
+{"train_info/time_between_train_steps": 0.0052645206451416016, "step": 279}
+{"info/global_step": 280, "train_info/time_within_train_step": 27.438360452651978, "step": 280}
+{"train_info/time_between_train_steps": 0.009800910949707031, "step": 280}
+{"info/global_step": 281, "train_info/time_within_train_step": 27.275152683258057, "step": 281}
+{"train_info/time_between_train_steps": 0.010220527648925781, "step": 281}
+{"info/global_step": 282, "train_info/time_within_train_step": 27.287874460220337, "step": 282}
+{"train_info/time_between_train_steps": 0.009453296661376953, "step": 282}
+{"info/global_step": 283, "train_info/time_within_train_step": 27.3619487285614, "step": 283}
+{"train_info/time_between_train_steps": 0.005300283432006836, "step": 283}
+{"info/global_step": 284, "train_info/time_within_train_step": 27.389939069747925, "step": 284}
+{"train_info/time_between_train_steps": 0.005606174468994141, "step": 284}
+{"info/global_step": 285, "train_info/time_within_train_step": 27.388286352157593, "step": 285}
+{"train_info/time_between_train_steps": 0.0052378177642822266, "step": 285}
+{"info/global_step": 286, "train_info/time_within_train_step": 27.54313564300537, "step": 286}
+{"train_info/time_between_train_steps": 0.005471229553222656, "step": 286}
+{"info/global_step": 287, "train_info/time_within_train_step": 62.113768339157104, "step": 287}
+{"train_info/time_between_train_steps": 0.010213851928710938, "step": 287}
+{"info/global_step": 288, "train_info/time_within_train_step": 51.619630336761475, "step": 288}
+{"train_info/time_between_train_steps": 0.005505561828613281, "step": 288}
+{"info/global_step": 289, "train_info/time_within_train_step": 49.04721999168396, "step": 289}
+{"train_info/time_between_train_steps": 0.016670703887939453, "step": 289}
+{"info/global_step": 290, "train_info/time_within_train_step": 62.694255113601685, "step": 290}
+{"train_info/time_between_train_steps": 0.015348196029663086, "step": 290}
+{"info/global_step": 291, "train_info/time_within_train_step": 39.349955558776855, "step": 291}
+{"train_info/time_between_train_steps": 0.010707855224609375, "step": 291}
+{"info/global_step": 292, "train_info/time_within_train_step": 63.27240610122681, "step": 292}
+{"train_info/time_between_train_steps": 0.016766071319580078, "step": 292}
+{"info/global_step": 293, "train_info/time_within_train_step": 62.66075897216797, "step": 293}
+{"train_info/time_between_train_steps": 0.005844593048095703, "step": 293}
+{"info/global_step": 294, "train_info/time_within_train_step": 63.79219722747803, "step": 294}
+{"train_info/time_between_train_steps": 0.005786418914794922, "step": 294}
+{"info/global_step": 295, "train_info/time_within_train_step": 39.02157211303711, "step": 295}
+{"train_info/time_between_train_steps": 0.010480165481567383, "step": 295}
+{"info/global_step": 296, "train_info/time_within_train_step": 63.25954031944275, "step": 296}
+{"train_info/time_between_train_steps": 0.006104469299316406, "step": 296}
+{"info/global_step": 297, "train_info/time_within_train_step": 38.78747344017029, "step": 297}
+{"train_info/time_between_train_steps": 0.016247034072875977, "step": 297}
+{"train_info/time_between_train_steps": 24.879022359848022, "step": 297}
+{"info/global_step": 298, "train_info/time_within_train_step": 63.05013298988342, "step": 298}
+{"train_info/time_between_train_steps": 0.010804176330566406, "step": 298}
+{"info/global_step": 299, "train_info/time_within_train_step": 38.07649254798889, "step": 299}
+{"train_info/time_between_train_steps": 0.006056308746337891, "step": 299}
+{"info/global_step": 300, "train_info/time_within_train_step": 63.34940791130066, "step": 300}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734018318, "_runtime": 8769}, "step": 300}
+{"logs": {"train/loss": 5.09, "train/learning_rate": 0.0005, "train/epoch": 11.0, "_timestamp": 1734018318, "_runtime": 8769}, "step": 300}
+{"train_info/time_between_train_steps": 2.725443124771118, "step": 300}
+{"info/global_step": 301, "train_info/time_within_train_step": 43.05689239501953, "step": 301}
+{"train_info/time_between_train_steps": 0.005789518356323242, "step": 301}
+{"info/global_step": 302, "train_info/time_within_train_step": 58.94157671928406, "step": 302}
+{"train_info/time_between_train_steps": 0.006877422332763672, "step": 302}
+{"info/global_step": 303, "train_info/time_within_train_step": 62.772634506225586, "step": 303}
+{"train_info/time_between_train_steps": 0.006282329559326172, "step": 303}
+{"info/global_step": 304, "train_info/time_within_train_step": 63.19356369972229, "step": 304}
+{"train_info/time_between_train_steps": 0.006329536437988281, "step": 304}
+{"info/global_step": 305, "train_info/time_within_train_step": 62.620800733566284, "step": 305}
+{"train_info/time_between_train_steps": 0.005703926086425781, "step": 305}
+{"info/global_step": 306, "train_info/time_within_train_step": 62.71409749984741, "step": 306}
+{"train_info/time_between_train_steps": 0.005829811096191406, "step": 306}
+{"info/global_step": 307, "train_info/time_within_train_step": 63.10847234725952, "step": 307}
+{"train_info/time_between_train_steps": 0.010767936706542969, "step": 307}
+{"info/global_step": 308, "train_info/time_within_train_step": 62.37164926528931, "step": 308}
+{"train_info/time_between_train_steps": 0.010652542114257812, "step": 308}
+{"info/global_step": 309, "train_info/time_within_train_step": 64.93454504013062, "step": 309}
+{"train_info/time_between_train_steps": 0.010000228881835938, "step": 309}
+{"info/global_step": 310, "train_info/time_within_train_step": 62.444257497787476, "step": 310}
+{"train_info/time_between_train_steps": 0.00550389289855957, "step": 310}
+{"info/global_step": 311, "train_info/time_within_train_step": 62.527754068374634, "step": 311}
+{"train_info/time_between_train_steps": 0.007643699645996094, "step": 311}
+{"info/global_step": 312, "train_info/time_within_train_step": 35.25742530822754, "step": 312}
+{"train_info/time_between_train_steps": 0.012233495712280273, "step": 312}
+{"info/global_step": 313, "train_info/time_within_train_step": 63.279733419418335, "step": 313}
+{"train_info/time_between_train_steps": 0.0058934688568115234, "step": 313}
+{"info/global_step": 314, "train_info/time_within_train_step": 62.38103532791138, "step": 314}
+{"train_info/time_between_train_steps": 0.010141134262084961, "step": 314}
+{"info/global_step": 315, "train_info/time_within_train_step": 62.90363359451294, "step": 315}
+{"train_info/time_between_train_steps": 0.010444879531860352, "step": 315}
+{"info/global_step": 316, "train_info/time_within_train_step": 38.71703815460205, "step": 316}
+{"train_info/time_between_train_steps": 0.005684375762939453, "step": 316}
+{"info/global_step": 317, "train_info/time_within_train_step": 57.18065404891968, "step": 317}
+{"train_info/time_between_train_steps": 0.010752201080322266, "step": 317}
+{"info/global_step": 318, "train_info/time_within_train_step": 60.14686155319214, "step": 318}
+{"train_info/time_between_train_steps": 0.02055811882019043, "step": 318}
+{"info/global_step": 319, "train_info/time_within_train_step": 61.0486376285553, "step": 319}
+{"train_info/time_between_train_steps": 0.015067338943481445, "step": 319}
+{"info/global_step": 320, "train_info/time_within_train_step": 43.31436204910278, "step": 320}
+{"train_info/time_between_train_steps": 0.005670785903930664, "step": 320}
+{"info/global_step": 321, "train_info/time_within_train_step": 27.510136604309082, "step": 321}
+{"train_info/time_between_train_steps": 0.0057833194732666016, "step": 321}
+{"info/global_step": 322, "train_info/time_within_train_step": 27.48550820350647, "step": 322}
+{"train_info/time_between_train_steps": 0.00570988655090332, "step": 322}
+{"info/global_step": 323, "train_info/time_within_train_step": 38.080469846725464, "step": 323}
+{"train_info/time_between_train_steps": 0.005852460861206055, "step": 323}
+{"info/global_step": 324, "train_info/time_within_train_step": 63.27030158042908, "step": 324}
+{"train_info/time_between_train_steps": 0.01132345199584961, "step": 324}
+{"train_info/time_between_train_steps": 25.5912868976593, "step": 324}
+{"info/global_step": 325, "train_info/time_within_train_step": 44.016984939575195, "step": 325}
+{"train_info/time_between_train_steps": 0.0075397491455078125, "step": 325}
+{"info/global_step": 326, "train_info/time_within_train_step": 63.211859941482544, "step": 326}
+{"train_info/time_between_train_steps": 0.009507417678833008, "step": 326}
+{"info/global_step": 327, "train_info/time_within_train_step": 39.16793966293335, "step": 327}
+{"train_info/time_between_train_steps": 0.0061991214752197266, "step": 327}
+{"info/global_step": 328, "train_info/time_within_train_step": 63.53343939781189, "step": 328}
+{"train_info/time_between_train_steps": 0.006281137466430664, "step": 328}
+{"info/global_step": 329, "train_info/time_within_train_step": 44.62076425552368, "step": 329}
+{"train_info/time_between_train_steps": 0.006032466888427734, "step": 329}
+{"info/global_step": 330, "train_info/time_within_train_step": 56.673625469207764, "step": 330}
+{"train_info/time_between_train_steps": 0.006520748138427734, "step": 330}
+{"info/global_step": 331, "train_info/time_within_train_step": 62.45552182197571, "step": 331}
+{"train_info/time_between_train_steps": 0.00608062744140625, "step": 331}
+{"info/global_step": 332, "train_info/time_within_train_step": 63.37776446342468, "step": 332}
+{"train_info/time_between_train_steps": 0.0053958892822265625, "step": 332}
+{"info/global_step": 333, "train_info/time_within_train_step": 44.51482439041138, "step": 333}
+{"train_info/time_between_train_steps": 0.00539398193359375, "step": 333}
+{"info/global_step": 334, "train_info/time_within_train_step": 57.61631727218628, "step": 334}
+{"train_info/time_between_train_steps": 0.005710601806640625, "step": 334}
+{"info/global_step": 335, "train_info/time_within_train_step": 55.79638600349426, "step": 335}
+{"train_info/time_between_train_steps": 0.00545954704284668, "step": 335}
+{"info/global_step": 336, "train_info/time_within_train_step": 43.167993783950806, "step": 336}
+{"train_info/time_between_train_steps": 0.005704402923583984, "step": 336}
+{"info/global_step": 337, "train_info/time_within_train_step": 62.88469696044922, "step": 337}
+{"train_info/time_between_train_steps": 0.007785320281982422, "step": 337}
+{"info/global_step": 338, "train_info/time_within_train_step": 37.75188684463501, "step": 338}
+{"train_info/time_between_train_steps": 0.005894660949707031, "step": 338}
+{"info/global_step": 339, "train_info/time_within_train_step": 63.265533208847046, "step": 339}
+{"train_info/time_between_train_steps": 0.005821704864501953, "step": 339}
+{"info/global_step": 340, "train_info/time_within_train_step": 63.89545726776123, "step": 340}
+{"train_info/time_between_train_steps": 0.011835098266601562, "step": 340}
+{"info/global_step": 341, "train_info/time_within_train_step": 62.91144275665283, "step": 341}
+{"train_info/time_between_train_steps": 0.005738973617553711, "step": 341}
+{"info/global_step": 342, "train_info/time_within_train_step": 35.34277892112732, "step": 342}
+{"train_info/time_between_train_steps": 0.01066899299621582, "step": 342}
+{"info/global_step": 343, "train_info/time_within_train_step": 63.35046315193176, "step": 343}
+{"train_info/time_between_train_steps": 0.010930538177490234, "step": 343}
+{"info/global_step": 344, "train_info/time_within_train_step": 62.42418146133423, "step": 344}
+{"train_info/time_between_train_steps": 0.005555629730224609, "step": 344}
+{"info/global_step": 345, "train_info/time_within_train_step": 63.136096239089966, "step": 345}
+{"train_info/time_between_train_steps": 0.007703065872192383, "step": 345}
+{"info/global_step": 346, "train_info/time_within_train_step": 62.46624135971069, "step": 346}
+{"train_info/time_between_train_steps": 0.008013725280761719, "step": 346}
+{"info/global_step": 347, "train_info/time_within_train_step": 62.25688028335571, "step": 347}
+{"train_info/time_between_train_steps": 0.013287544250488281, "step": 347}
+{"info/global_step": 348, "train_info/time_within_train_step": 63.2596070766449, "step": 348}
+{"train_info/time_between_train_steps": 0.0060291290283203125, "step": 348}
+{"info/global_step": 349, "train_info/time_within_train_step": 62.33553075790405, "step": 349}
+{"train_info/time_between_train_steps": 0.006972551345825195, "step": 349}
+{"info/global_step": 350, "train_info/time_within_train_step": 63.121176958084106, "step": 350}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734021122, "_runtime": 11573}, "step": 350}
+{"logs": {"train/loss": 4.9223, "train/learning_rate": 0.00047222222222222224, "train/epoch": 12.02, "_timestamp": 1734021122, "_runtime": 11573}, "step": 350}
+{"train_info/time_between_train_steps": 0.022982358932495117, "step": 350}
+{"info/global_step": 351, "train_info/time_within_train_step": 62.65292811393738, "step": 351}
+{"train_info/time_between_train_steps": 0.008503198623657227, "step": 351}
+{"train_info/time_between_train_steps": 25.963787317276, "step": 351}
+{"info/global_step": 352, "train_info/time_within_train_step": 62.59736180305481, "step": 352}
+{"train_info/time_between_train_steps": 0.006203413009643555, "step": 352}
+{"info/global_step": 353, "train_info/time_within_train_step": 56.03219437599182, "step": 353}
+{"train_info/time_between_train_steps": 0.008867502212524414, "step": 353}
+{"info/global_step": 354, "train_info/time_within_train_step": 61.08942484855652, "step": 354}
+{"train_info/time_between_train_steps": 0.009290933609008789, "step": 354}
+{"info/global_step": 355, "train_info/time_within_train_step": 62.63206505775452, "step": 355}
+{"train_info/time_between_train_steps": 0.013931989669799805, "step": 355}
+{"info/global_step": 356, "train_info/time_within_train_step": 28.93071961402893, "step": 356}
+{"train_info/time_between_train_steps": 0.006063938140869141, "step": 356}
+{"info/global_step": 357, "train_info/time_within_train_step": 27.60074281692505, "step": 357}
+{"train_info/time_between_train_steps": 0.006150722503662109, "step": 357}
+{"info/global_step": 358, "train_info/time_within_train_step": 27.416497230529785, "step": 358}
+{"train_info/time_between_train_steps": 0.010146856307983398, "step": 358}
+{"info/global_step": 359, "train_info/time_within_train_step": 52.6373553276062, "step": 359}
+{"train_info/time_between_train_steps": 0.012560606002807617, "step": 359}
+{"info/global_step": 360, "train_info/time_within_train_step": 61.19530987739563, "step": 360}
+{"train_info/time_between_train_steps": 0.015728235244750977, "step": 360}
+{"info/global_step": 361, "train_info/time_within_train_step": 39.55336856842041, "step": 361}
+{"train_info/time_between_train_steps": 0.010004043579101562, "step": 361}
+{"info/global_step": 362, "train_info/time_within_train_step": 63.19040870666504, "step": 362}
+{"train_info/time_between_train_steps": 0.00986170768737793, "step": 362}
+{"info/global_step": 363, "train_info/time_within_train_step": 38.63350439071655, "step": 363}
+{"train_info/time_between_train_steps": 0.005837440490722656, "step": 363}
+{"info/global_step": 364, "train_info/time_within_train_step": 62.951841592788696, "step": 364}
+{"train_info/time_between_train_steps": 0.010553598403930664, "step": 364}
+{"info/global_step": 365, "train_info/time_within_train_step": 62.53174638748169, "step": 365}
+{"train_info/time_between_train_steps": 0.00979304313659668, "step": 365}
+{"info/global_step": 366, "train_info/time_within_train_step": 63.35952949523926, "step": 366}
+{"train_info/time_between_train_steps": 0.007644176483154297, "step": 366}
+{"info/global_step": 367, "train_info/time_within_train_step": 62.65034341812134, "step": 367}
+{"train_info/time_between_train_steps": 0.010164737701416016, "step": 367}
+{"info/global_step": 368, "train_info/time_within_train_step": 62.3890860080719, "step": 368}
+{"train_info/time_between_train_steps": 0.005559206008911133, "step": 368}
+{"info/global_step": 369, "train_info/time_within_train_step": 37.466148376464844, "step": 369}
+{"train_info/time_between_train_steps": 0.006103038787841797, "step": 369}
+{"info/global_step": 370, "train_info/time_within_train_step": 63.20536994934082, "step": 370}
+{"train_info/time_between_train_steps": 0.005428791046142578, "step": 370}
+{"info/global_step": 371, "train_info/time_within_train_step": 40.26640796661377, "step": 371}
+{"train_info/time_between_train_steps": 0.008432626724243164, "step": 371}
+{"info/global_step": 372, "train_info/time_within_train_step": 61.57385516166687, "step": 372}
+{"train_info/time_between_train_steps": 0.0059833526611328125, "step": 372}
+{"info/global_step": 373, "train_info/time_within_train_step": 52.18363070487976, "step": 373}
+{"train_info/time_between_train_steps": 0.0055615901947021484, "step": 373}
+{"info/global_step": 374, "train_info/time_within_train_step": 46.723185539245605, "step": 374}
+{"train_info/time_between_train_steps": 0.012018442153930664, "step": 374}
+{"info/global_step": 375, "train_info/time_within_train_step": 62.76984357833862, "step": 375}
+{"train_info/time_between_train_steps": 0.010439157485961914, "step": 375}
+{"info/global_step": 376, "train_info/time_within_train_step": 63.01080536842346, "step": 376}
+{"train_info/time_between_train_steps": 0.0057506561279296875, "step": 376}
+{"info/global_step": 377, "train_info/time_within_train_step": 62.619691371917725, "step": 377}
+{"train_info/time_between_train_steps": 0.011093854904174805, "step": 377}
+{"info/global_step": 378, "train_info/time_within_train_step": 63.47130537033081, "step": 378}
+{"train_info/time_between_train_steps": 0.006651639938354492, "step": 378}
+{"train_info/time_between_train_steps": 25.90372371673584, "step": 378}
+{"info/global_step": 379, "train_info/time_within_train_step": 63.33312368392944, "step": 379}
+{"train_info/time_between_train_steps": 0.008546590805053711, "step": 379}
+{"info/global_step": 380, "train_info/time_within_train_step": 62.79398536682129, "step": 380}
+{"train_info/time_between_train_steps": 0.006976127624511719, "step": 380}
+{"info/global_step": 381, "train_info/time_within_train_step": 63.088123083114624, "step": 381}
+{"train_info/time_between_train_steps": 0.006814002990722656, "step": 381}
+{"info/global_step": 382, "train_info/time_within_train_step": 62.92134380340576, "step": 382}
+{"train_info/time_between_train_steps": 0.015470504760742188, "step": 382}
+{"info/global_step": 383, "train_info/time_within_train_step": 62.46375918388367, "step": 383}
+{"train_info/time_between_train_steps": 0.005959033966064453, "step": 383}
+{"info/global_step": 384, "train_info/time_within_train_step": 63.57549524307251, "step": 384}
+{"train_info/time_between_train_steps": 0.011436939239501953, "step": 384}
+{"info/global_step": 385, "train_info/time_within_train_step": 62.574257373809814, "step": 385}
+{"train_info/time_between_train_steps": 0.00833892822265625, "step": 385}
+{"info/global_step": 386, "train_info/time_within_train_step": 64.62696290016174, "step": 386}
+{"train_info/time_between_train_steps": 0.012309551239013672, "step": 386}
+{"info/global_step": 387, "train_info/time_within_train_step": 62.91460633277893, "step": 387}
+{"train_info/time_between_train_steps": 0.005742073059082031, "step": 387}
+{"info/global_step": 388, "train_info/time_within_train_step": 54.99428129196167, "step": 388}
+{"train_info/time_between_train_steps": 0.007470846176147461, "step": 388}
+{"info/global_step": 389, "train_info/time_within_train_step": 61.03674125671387, "step": 389}
+{"train_info/time_between_train_steps": 0.014876127243041992, "step": 389}
+{"info/global_step": 390, "train_info/time_within_train_step": 56.38670539855957, "step": 390}
+{"train_info/time_between_train_steps": 0.005488395690917969, "step": 390}
+{"info/global_step": 391, "train_info/time_within_train_step": 27.442161560058594, "step": 391}
+{"train_info/time_between_train_steps": 0.01014566421508789, "step": 391}
+{"info/global_step": 392, "train_info/time_within_train_step": 27.39665412902832, "step": 392}
+{"train_info/time_between_train_steps": 0.009151935577392578, "step": 392}
+{"info/global_step": 393, "train_info/time_within_train_step": 27.275139808654785, "step": 393}
+{"train_info/time_between_train_steps": 0.010471582412719727, "step": 393}
+{"info/global_step": 394, "train_info/time_within_train_step": 27.452674388885498, "step": 394}
+{"train_info/time_between_train_steps": 0.00555419921875, "step": 394}
+{"info/global_step": 395, "train_info/time_within_train_step": 27.414249658584595, "step": 395}
+{"train_info/time_between_train_steps": 0.010443925857543945, "step": 395}
+{"info/global_step": 396, "train_info/time_within_train_step": 27.40784215927124, "step": 396}
+{"train_info/time_between_train_steps": 0.015532732009887695, "step": 396}
+{"info/global_step": 397, "train_info/time_within_train_step": 27.337252378463745, "step": 397}
+{"train_info/time_between_train_steps": 0.005442380905151367, "step": 397}
+{"info/global_step": 398, "train_info/time_within_train_step": 27.360122680664062, "step": 398}
+{"train_info/time_between_train_steps": 0.005450725555419922, "step": 398}
+{"info/global_step": 399, "train_info/time_within_train_step": 27.408414840698242, "step": 399}
+{"train_info/time_between_train_steps": 0.0055730342864990234, "step": 399}
+{"info/global_step": 400, "train_info/time_within_train_step": 27.41246509552002, "step": 400}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734023701, "_runtime": 14152}, "step": 400}
+{"logs": {"train/loss": 4.7935, "train/learning_rate": 0.00044444444444444436, "train/epoch": 14.02, "_timestamp": 1734023701, "_runtime": 14152}, "step": 400}
+{"train_info/time_between_train_steps": 2.476400375366211, "step": 400}
+{"info/global_step": 401, "train_info/time_within_train_step": 27.35844397544861, "step": 401}
+{"train_info/time_between_train_steps": 0.005720376968383789, "step": 401}
+{"info/global_step": 402, "train_info/time_within_train_step": 27.44756293296814, "step": 402}
+{"train_info/time_between_train_steps": 0.005609273910522461, "step": 402}
+{"info/global_step": 403, "train_info/time_within_train_step": 27.32063341140747, "step": 403}
+{"train_info/time_between_train_steps": 0.010939836502075195, "step": 403}
+{"info/global_step": 404, "train_info/time_within_train_step": 27.27696919441223, "step": 404}
+{"train_info/time_between_train_steps": 0.009904861450195312, "step": 404}
+{"info/global_step": 405, "train_info/time_within_train_step": 27.402870655059814, "step": 405}
+{"train_info/time_between_train_steps": 0.006539583206176758, "step": 405}
+{"train_info/time_between_train_steps": 13.184408903121948, "step": 405}
+{"info/global_step": 406, "train_info/time_within_train_step": 27.431373119354248, "step": 406}
+{"train_info/time_between_train_steps": 0.010444402694702148, "step": 406}
+{"info/global_step": 407, "train_info/time_within_train_step": 27.436427116394043, "step": 407}
+{"train_info/time_between_train_steps": 0.005595684051513672, "step": 407}
+{"info/global_step": 408, "train_info/time_within_train_step": 27.235920906066895, "step": 408}
+{"train_info/time_between_train_steps": 0.005442142486572266, "step": 408}
+{"info/global_step": 409, "train_info/time_within_train_step": 27.40474843978882, "step": 409}
+{"train_info/time_between_train_steps": 0.005550861358642578, "step": 409}
+{"info/global_step": 410, "train_info/time_within_train_step": 27.283785581588745, "step": 410}
+{"train_info/time_between_train_steps": 0.005525827407836914, "step": 410}
+{"info/global_step": 411, "train_info/time_within_train_step": 27.405362129211426, "step": 411}
+{"train_info/time_between_train_steps": 0.0055696964263916016, "step": 411}
+{"info/global_step": 412, "train_info/time_within_train_step": 27.235255002975464, "step": 412}
+{"train_info/time_between_train_steps": 0.005337953567504883, "step": 412}
+{"info/global_step": 413, "train_info/time_within_train_step": 27.267664670944214, "step": 413}
+{"train_info/time_between_train_steps": 0.005177736282348633, "step": 413}
+{"info/global_step": 414, "train_info/time_within_train_step": 27.32730531692505, "step": 414}
+{"train_info/time_between_train_steps": 0.005349159240722656, "step": 414}
+{"info/global_step": 415, "train_info/time_within_train_step": 27.41621732711792, "step": 415}
+{"train_info/time_between_train_steps": 0.005143880844116211, "step": 415}
+{"info/global_step": 416, "train_info/time_within_train_step": 27.236698865890503, "step": 416}
+{"train_info/time_between_train_steps": 0.0052738189697265625, "step": 416}
+{"info/global_step": 417, "train_info/time_within_train_step": 27.513365030288696, "step": 417}
+{"train_info/time_between_train_steps": 0.0054819583892822266, "step": 417}
+{"info/global_step": 418, "train_info/time_within_train_step": 27.372018814086914, "step": 418}
+{"train_info/time_between_train_steps": 0.005272626876831055, "step": 418}
+{"info/global_step": 419, "train_info/time_within_train_step": 27.251151084899902, "step": 419}
+{"train_info/time_between_train_steps": 0.010369062423706055, "step": 419}
+{"info/global_step": 420, "train_info/time_within_train_step": 27.223424434661865, "step": 420}
+{"train_info/time_between_train_steps": 0.007292270660400391, "step": 420}
+{"info/global_step": 421, "train_info/time_within_train_step": 27.294984579086304, "step": 421}
+{"train_info/time_between_train_steps": 0.005219697952270508, "step": 421}
+{"info/global_step": 422, "train_info/time_within_train_step": 27.456367254257202, "step": 422}
+{"train_info/time_between_train_steps": 0.009551525115966797, "step": 422}
+{"info/global_step": 423, "train_info/time_within_train_step": 27.361812114715576, "step": 423}
+{"train_info/time_between_train_steps": 0.005151987075805664, "step": 423}
+{"info/global_step": 424, "train_info/time_within_train_step": 27.362172842025757, "step": 424}
+{"train_info/time_between_train_steps": 0.01465153694152832, "step": 424}
+{"info/global_step": 425, "train_info/time_within_train_step": 27.449859380722046, "step": 425}
+{"train_info/time_between_train_steps": 0.0052831172943115234, "step": 425}
+{"info/global_step": 426, "train_info/time_within_train_step": 27.255072116851807, "step": 426}
+{"train_info/time_between_train_steps": 0.010275125503540039, "step": 426}
+{"info/global_step": 427, "train_info/time_within_train_step": 27.464863300323486, "step": 427}
+{"train_info/time_between_train_steps": 0.014850854873657227, "step": 427}
+{"info/global_step": 428, "train_info/time_within_train_step": 27.35314130783081, "step": 428}
+{"train_info/time_between_train_steps": 0.0053141117095947266, "step": 428}
+{"info/global_step": 429, "train_info/time_within_train_step": 27.404520273208618, "step": 429}
+{"train_info/time_between_train_steps": 0.005590200424194336, "step": 429}
+{"info/global_step": 430, "train_info/time_within_train_step": 27.397706031799316, "step": 430}
+{"train_info/time_between_train_steps": 0.0055694580078125, "step": 430}
+{"info/global_step": 431, "train_info/time_within_train_step": 27.50855278968811, "step": 431}
+{"train_info/time_between_train_steps": 0.01034998893737793, "step": 431}
+{"info/global_step": 432, "train_info/time_within_train_step": 27.400508642196655, "step": 432}
+{"train_info/time_between_train_steps": 0.006328582763671875, "step": 432}
+{"train_info/time_between_train_steps": 13.634516477584839, "step": 432}
+{"info/global_step": 433, "train_info/time_within_train_step": 27.39557695388794, "step": 433}
+{"train_info/time_between_train_steps": 0.0060999393463134766, "step": 433}
+{"info/global_step": 434, "train_info/time_within_train_step": 27.514052152633667, "step": 434}
+{"train_info/time_between_train_steps": 0.005793094635009766, "step": 434}
+{"info/global_step": 435, "train_info/time_within_train_step": 27.438849925994873, "step": 435}
+{"train_info/time_between_train_steps": 0.006095409393310547, "step": 435}
+{"info/global_step": 436, "train_info/time_within_train_step": 27.546039819717407, "step": 436}
+{"train_info/time_between_train_steps": 0.005712270736694336, "step": 436}
+{"info/global_step": 437, "train_info/time_within_train_step": 27.454644203186035, "step": 437}
+{"train_info/time_between_train_steps": 0.011202096939086914, "step": 437}
+{"info/global_step": 438, "train_info/time_within_train_step": 27.942503929138184, "step": 438}
+{"train_info/time_between_train_steps": 0.010326147079467773, "step": 438}
+{"info/global_step": 439, "train_info/time_within_train_step": 27.43771505355835, "step": 439}
+{"train_info/time_between_train_steps": 0.010714292526245117, "step": 439}
+{"info/global_step": 440, "train_info/time_within_train_step": 27.47238802909851, "step": 440}
+{"train_info/time_between_train_steps": 0.005444049835205078, "step": 440}
+{"info/global_step": 441, "train_info/time_within_train_step": 27.417526960372925, "step": 441}
+{"train_info/time_between_train_steps": 0.010313272476196289, "step": 441}
+{"info/global_step": 442, "train_info/time_within_train_step": 27.60570979118347, "step": 442}
+{"train_info/time_between_train_steps": 0.010219335556030273, "step": 442}
+{"info/global_step": 443, "train_info/time_within_train_step": 27.564589262008667, "step": 443}
+{"train_info/time_between_train_steps": 0.010411977767944336, "step": 443}
+{"info/global_step": 444, "train_info/time_within_train_step": 27.527841091156006, "step": 444}
+{"train_info/time_between_train_steps": 0.005663871765136719, "step": 444}
+{"info/global_step": 445, "train_info/time_within_train_step": 27.54188632965088, "step": 445}
+{"train_info/time_between_train_steps": 0.005589485168457031, "step": 445}
+{"info/global_step": 446, "train_info/time_within_train_step": 27.326794862747192, "step": 446}
+{"train_info/time_between_train_steps": 0.005503416061401367, "step": 446}
+{"info/global_step": 447, "train_info/time_within_train_step": 27.360533952713013, "step": 447}
+{"train_info/time_between_train_steps": 0.0054514408111572266, "step": 447}
+{"info/global_step": 448, "train_info/time_within_train_step": 27.39722967147827, "step": 448}
+{"train_info/time_between_train_steps": 0.010307073593139648, "step": 448}
+{"info/global_step": 449, "train_info/time_within_train_step": 27.385610103607178, "step": 449}
+{"train_info/time_between_train_steps": 0.005495548248291016, "step": 449}
+{"info/global_step": 450, "train_info/time_within_train_step": 27.327661991119385, "step": 450}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734025101, "_runtime": 15552}, "step": 450}
+{"logs": {"train/loss": 4.6037, "train/learning_rate": 0.00041666666666666664, "train/epoch": 16.02, "_timestamp": 1734025101, "_runtime": 15552}, "step": 450}
+{"train_info/time_between_train_steps": 0.007613182067871094, "step": 450}
+{"info/global_step": 451, "train_info/time_within_train_step": 27.35515594482422, "step": 451}
+{"train_info/time_between_train_steps": 0.005491018295288086, "step": 451}
+{"info/global_step": 452, "train_info/time_within_train_step": 27.310898065567017, "step": 452}
+{"train_info/time_between_train_steps": 0.005610227584838867, "step": 452}
+{"info/global_step": 453, "train_info/time_within_train_step": 27.33914542198181, "step": 453}
+{"train_info/time_between_train_steps": 0.005529642105102539, "step": 453}
+{"info/global_step": 454, "train_info/time_within_train_step": 27.39768886566162, "step": 454}
+{"train_info/time_between_train_steps": 0.012534141540527344, "step": 454}
+{"info/global_step": 455, "train_info/time_within_train_step": 27.32167959213257, "step": 455}
+{"train_info/time_between_train_steps": 0.005671024322509766, "step": 455}
+{"info/global_step": 456, "train_info/time_within_train_step": 27.241485595703125, "step": 456}
+{"train_info/time_between_train_steps": 0.005723237991333008, "step": 456}
+{"info/global_step": 457, "train_info/time_within_train_step": 27.241865158081055, "step": 457}
+{"train_info/time_between_train_steps": 0.005604982376098633, "step": 457}
+{"info/global_step": 458, "train_info/time_within_train_step": 27.292434692382812, "step": 458}
+{"train_info/time_between_train_steps": 0.005697011947631836, "step": 458}
+{"info/global_step": 459, "train_info/time_within_train_step": 27.406586408615112, "step": 459}
+{"train_info/time_between_train_steps": 0.006293773651123047, "step": 459}
+{"train_info/time_between_train_steps": 12.977145433425903, "step": 459}
+{"info/global_step": 460, "train_info/time_within_train_step": 27.311132669448853, "step": 460}
+{"train_info/time_between_train_steps": 0.0058557987213134766, "step": 460}
+{"info/global_step": 461, "train_info/time_within_train_step": 27.39264416694641, "step": 461}
+{"train_info/time_between_train_steps": 0.005805492401123047, "step": 461}
+{"info/global_step": 462, "train_info/time_within_train_step": 27.285706281661987, "step": 462}
+{"train_info/time_between_train_steps": 0.005481243133544922, "step": 462}
+{"info/global_step": 463, "train_info/time_within_train_step": 27.623825311660767, "step": 463}
+{"train_info/time_between_train_steps": 0.005776882171630859, "step": 463}
+{"info/global_step": 464, "train_info/time_within_train_step": 27.467833042144775, "step": 464}
+{"train_info/time_between_train_steps": 0.010886907577514648, "step": 464}
+{"info/global_step": 465, "train_info/time_within_train_step": 27.460777282714844, "step": 465}
+{"train_info/time_between_train_steps": 0.00557398796081543, "step": 465}
+{"info/global_step": 466, "train_info/time_within_train_step": 27.424190044403076, "step": 466}
+{"train_info/time_between_train_steps": 0.005922794342041016, "step": 466}
+{"info/global_step": 467, "train_info/time_within_train_step": 27.459566831588745, "step": 467}
+{"train_info/time_between_train_steps": 0.0055446624755859375, "step": 467}
+{"info/global_step": 468, "train_info/time_within_train_step": 27.47495436668396, "step": 468}
+{"train_info/time_between_train_steps": 0.005521059036254883, "step": 468}
+{"info/global_step": 469, "train_info/time_within_train_step": 27.280802249908447, "step": 469}
+{"train_info/time_between_train_steps": 0.005437374114990234, "step": 469}
+{"info/global_step": 470, "train_info/time_within_train_step": 27.380991220474243, "step": 470}
+{"train_info/time_between_train_steps": 0.00542902946472168, "step": 470}
+{"info/global_step": 471, "train_info/time_within_train_step": 27.353211641311646, "step": 471}
+{"train_info/time_between_train_steps": 0.005442619323730469, "step": 471}
+{"info/global_step": 472, "train_info/time_within_train_step": 27.47597312927246, "step": 472}
+{"train_info/time_between_train_steps": 0.005683183670043945, "step": 472}
+{"info/global_step": 473, "train_info/time_within_train_step": 27.295644998550415, "step": 473}
+{"train_info/time_between_train_steps": 0.01082611083984375, "step": 473}
+{"info/global_step": 474, "train_info/time_within_train_step": 27.31233859062195, "step": 474}
+{"train_info/time_between_train_steps": 0.005454063415527344, "step": 474}
+{"info/global_step": 475, "train_info/time_within_train_step": 27.304818630218506, "step": 475}
+{"train_info/time_between_train_steps": 0.005460500717163086, "step": 475}
+{"info/global_step": 476, "train_info/time_within_train_step": 27.226924896240234, "step": 476}
+{"train_info/time_between_train_steps": 0.005532979965209961, "step": 476}
+{"info/global_step": 477, "train_info/time_within_train_step": 27.24191427230835, "step": 477}
+{"train_info/time_between_train_steps": 0.005452871322631836, "step": 477}
+{"info/global_step": 478, "train_info/time_within_train_step": 27.223898887634277, "step": 478}
+{"train_info/time_between_train_steps": 0.005465030670166016, "step": 478}
+{"info/global_step": 479, "train_info/time_within_train_step": 27.387126922607422, "step": 479}
+{"train_info/time_between_train_steps": 0.005356550216674805, "step": 479}
+{"info/global_step": 480, "train_info/time_within_train_step": 27.277735948562622, "step": 480}
+{"train_info/time_between_train_steps": 0.0054662227630615234, "step": 480}
+{"info/global_step": 481, "train_info/time_within_train_step": 27.282219171524048, "step": 481}
+{"train_info/time_between_train_steps": 0.005494117736816406, "step": 481}
+{"info/global_step": 482, "train_info/time_within_train_step": 27.259634017944336, "step": 482}
+{"train_info/time_between_train_steps": 0.005529642105102539, "step": 482}
+{"info/global_step": 483, "train_info/time_within_train_step": 27.269327402114868, "step": 483}
+{"train_info/time_between_train_steps": 0.005703449249267578, "step": 483}
+{"info/global_step": 484, "train_info/time_within_train_step": 27.323360204696655, "step": 484}
+{"train_info/time_between_train_steps": 0.005602359771728516, "step": 484}
+{"info/global_step": 485, "train_info/time_within_train_step": 27.34005641937256, "step": 485}
+{"train_info/time_between_train_steps": 0.009088993072509766, "step": 485}
+{"info/global_step": 486, "train_info/time_within_train_step": 27.380899667739868, "step": 486}
+{"train_info/time_between_train_steps": 0.006244659423828125, "step": 486}
+{"train_info/time_between_train_steps": 13.161745071411133, "step": 486}
+{"info/global_step": 487, "train_info/time_within_train_step": 27.30722403526306, "step": 487}
+{"train_info/time_between_train_steps": 0.009331941604614258, "step": 487}
+{"info/global_step": 488, "train_info/time_within_train_step": 27.70785689353943, "step": 488}
+{"train_info/time_between_train_steps": 0.005614757537841797, "step": 488}
+{"info/global_step": 489, "train_info/time_within_train_step": 27.435399055480957, "step": 489}
+{"train_info/time_between_train_steps": 0.009098529815673828, "step": 489}
+{"info/global_step": 490, "train_info/time_within_train_step": 27.529902696609497, "step": 490}
+{"train_info/time_between_train_steps": 0.005832672119140625, "step": 490}
+{"info/global_step": 491, "train_info/time_within_train_step": 27.316604614257812, "step": 491}
+{"train_info/time_between_train_steps": 0.012188434600830078, "step": 491}
+{"info/global_step": 492, "train_info/time_within_train_step": 27.570700883865356, "step": 492}
+{"train_info/time_between_train_steps": 0.005793094635009766, "step": 492}
+{"info/global_step": 493, "train_info/time_within_train_step": 27.361358404159546, "step": 493}
+{"train_info/time_between_train_steps": 0.008625984191894531, "step": 493}
+{"info/global_step": 494, "train_info/time_within_train_step": 27.454140186309814, "step": 494}
+{"train_info/time_between_train_steps": 0.005259037017822266, "step": 494}
+{"info/global_step": 495, "train_info/time_within_train_step": 27.25786852836609, "step": 495}
+{"train_info/time_between_train_steps": 0.0054705142974853516, "step": 495}
+{"info/global_step": 496, "train_info/time_within_train_step": 27.364845037460327, "step": 496}
+{"train_info/time_between_train_steps": 0.005374908447265625, "step": 496}
+{"info/global_step": 497, "train_info/time_within_train_step": 27.37478995323181, "step": 497}
+{"train_info/time_between_train_steps": 0.00540924072265625, "step": 497}
+{"info/global_step": 498, "train_info/time_within_train_step": 27.224589824676514, "step": 498}
+{"train_info/time_between_train_steps": 0.005507707595825195, "step": 498}
+{"info/global_step": 499, "train_info/time_within_train_step": 27.34480857849121, "step": 499}
+{"train_info/time_between_train_steps": 0.005693197250366211, "step": 499}
+{"info/global_step": 500, "train_info/time_within_train_step": 27.288265705108643, "step": 500}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734026495, "_runtime": 16946}, "step": 500}
+{"logs": {"train/loss": 4.4024, "train/learning_rate": 0.00038888888888888887, "train/epoch": 18.01, "_timestamp": 1734026495, "_runtime": 16946}, "step": 500}
+{"train_info/time_between_train_steps": 2.4482736587524414, "step": 500}
+{"info/global_step": 501, "train_info/time_within_train_step": 27.30676031112671, "step": 501}
+{"train_info/time_between_train_steps": 0.005380153656005859, "step": 501}
+{"info/global_step": 502, "train_info/time_within_train_step": 27.519262313842773, "step": 502}
+{"train_info/time_between_train_steps": 0.005234241485595703, "step": 502}
+{"info/global_step": 503, "train_info/time_within_train_step": 27.525526523590088, "step": 503}
+{"train_info/time_between_train_steps": 0.006452322006225586, "step": 503}
+{"info/global_step": 504, "train_info/time_within_train_step": 27.378526210784912, "step": 504}
+{"train_info/time_between_train_steps": 0.010275602340698242, "step": 504}
+{"info/global_step": 505, "train_info/time_within_train_step": 27.301759004592896, "step": 505}
+{"train_info/time_between_train_steps": 0.00568699836730957, "step": 505}
+{"info/global_step": 506, "train_info/time_within_train_step": 27.305496215820312, "step": 506}
+{"train_info/time_between_train_steps": 0.00527501106262207, "step": 506}
+{"info/global_step": 507, "train_info/time_within_train_step": 27.439831495285034, "step": 507}
+{"train_info/time_between_train_steps": 0.00533747673034668, "step": 507}
+{"info/global_step": 508, "train_info/time_within_train_step": 27.407270193099976, "step": 508}
+{"train_info/time_between_train_steps": 0.01714158058166504, "step": 508}
+{"info/global_step": 509, "train_info/time_within_train_step": 27.47842502593994, "step": 509}
+{"train_info/time_between_train_steps": 0.008681535720825195, "step": 509}
+{"info/global_step": 510, "train_info/time_within_train_step": 27.394446849822998, "step": 510}
+{"train_info/time_between_train_steps": 0.005408525466918945, "step": 510}
+{"info/global_step": 511, "train_info/time_within_train_step": 27.284421920776367, "step": 511}
+{"train_info/time_between_train_steps": 0.012503862380981445, "step": 511}
+{"info/global_step": 512, "train_info/time_within_train_step": 27.44820523262024, "step": 512}
+{"train_info/time_between_train_steps": 0.005690813064575195, "step": 512}
+{"info/global_step": 513, "train_info/time_within_train_step": 27.41025948524475, "step": 513}
+{"train_info/time_between_train_steps": 0.00981593132019043, "step": 513}
+{"train_info/time_between_train_steps": 13.341846466064453, "step": 513}
+{"info/global_step": 514, "train_info/time_within_train_step": 27.307711124420166, "step": 514}
+{"train_info/time_between_train_steps": 0.005297422409057617, "step": 514}
+{"info/global_step": 515, "train_info/time_within_train_step": 27.382830142974854, "step": 515}
+{"train_info/time_between_train_steps": 0.005719900131225586, "step": 515}
+{"info/global_step": 516, "train_info/time_within_train_step": 27.339378595352173, "step": 516}
+{"train_info/time_between_train_steps": 0.00529789924621582, "step": 516}
+{"info/global_step": 517, "train_info/time_within_train_step": 27.55426573753357, "step": 517}
+{"train_info/time_between_train_steps": 0.005452632904052734, "step": 517}
+{"info/global_step": 518, "train_info/time_within_train_step": 27.25271201133728, "step": 518}
+{"train_info/time_between_train_steps": 0.0057239532470703125, "step": 518}
+{"info/global_step": 519, "train_info/time_within_train_step": 27.53172993659973, "step": 519}
+{"train_info/time_between_train_steps": 0.0053942203521728516, "step": 519}
+{"info/global_step": 520, "train_info/time_within_train_step": 27.26256561279297, "step": 520}
+{"train_info/time_between_train_steps": 0.00538182258605957, "step": 520}
+{"info/global_step": 521, "train_info/time_within_train_step": 27.373138904571533, "step": 521}
+{"train_info/time_between_train_steps": 0.005136251449584961, "step": 521}
+{"info/global_step": 522, "train_info/time_within_train_step": 27.229695558547974, "step": 522}
+{"train_info/time_between_train_steps": 0.005351066589355469, "step": 522}
+{"info/global_step": 523, "train_info/time_within_train_step": 27.21516180038452, "step": 523}
+{"train_info/time_between_train_steps": 0.005151271820068359, "step": 523}
+{"info/global_step": 524, "train_info/time_within_train_step": 27.24782967567444, "step": 524}
+{"train_info/time_between_train_steps": 0.0051648616790771484, "step": 524}
+{"info/global_step": 525, "train_info/time_within_train_step": 27.341725826263428, "step": 525}
+{"train_info/time_between_train_steps": 0.005293130874633789, "step": 525}
+{"info/global_step": 526, "train_info/time_within_train_step": 27.358843088150024, "step": 526}
+{"train_info/time_between_train_steps": 0.005529165267944336, "step": 526}
+{"info/global_step": 527, "train_info/time_within_train_step": 27.335044384002686, "step": 527}
+{"train_info/time_between_train_steps": 0.009613752365112305, "step": 527}
+{"info/global_step": 528, "train_info/time_within_train_step": 27.444290161132812, "step": 528}
+{"train_info/time_between_train_steps": 0.009955883026123047, "step": 528}
+{"info/global_step": 529, "train_info/time_within_train_step": 27.289405345916748, "step": 529}
+{"train_info/time_between_train_steps": 0.009204626083374023, "step": 529}
+{"info/global_step": 530, "train_info/time_within_train_step": 27.37393093109131, "step": 530}
+{"train_info/time_between_train_steps": 0.005273342132568359, "step": 530}
+{"info/global_step": 531, "train_info/time_within_train_step": 27.296684980392456, "step": 531}
+{"train_info/time_between_train_steps": 0.005280494689941406, "step": 531}
+{"info/global_step": 532, "train_info/time_within_train_step": 27.412482500076294, "step": 532}
+{"train_info/time_between_train_steps": 0.005432844161987305, "step": 532}
+{"info/global_step": 533, "train_info/time_within_train_step": 27.304457426071167, "step": 533}
+{"train_info/time_between_train_steps": 0.010332345962524414, "step": 533}
+{"info/global_step": 534, "train_info/time_within_train_step": 27.34559440612793, "step": 534}
+{"train_info/time_between_train_steps": 0.005688190460205078, "step": 534}
+{"info/global_step": 535, "train_info/time_within_train_step": 27.370164155960083, "step": 535}
+{"train_info/time_between_train_steps": 0.005547046661376953, "step": 535}
+{"info/global_step": 536, "train_info/time_within_train_step": 27.304182767868042, "step": 536}
+{"train_info/time_between_train_steps": 0.0055201053619384766, "step": 536}
+{"info/global_step": 537, "train_info/time_within_train_step": 27.402703523635864, "step": 537}
+{"train_info/time_between_train_steps": 0.005670785903930664, "step": 537}
+{"info/global_step": 538, "train_info/time_within_train_step": 27.449815273284912, "step": 538}
+{"train_info/time_between_train_steps": 0.0057697296142578125, "step": 538}
+{"info/global_step": 539, "train_info/time_within_train_step": 27.407392501831055, "step": 539}
+{"train_info/time_between_train_steps": 0.009063243865966797, "step": 539}
+{"info/global_step": 540, "train_info/time_within_train_step": 27.401968479156494, "step": 540}
+{"train_info/time_between_train_steps": 0.006811857223510742, "step": 540}
+{"train_info/time_between_train_steps": 13.162440776824951, "step": 540}
+{"info/global_step": 541, "train_info/time_within_train_step": 27.2749183177948, "step": 541}
+{"train_info/time_between_train_steps": 0.006440401077270508, "step": 541}
+{"info/global_step": 542, "train_info/time_within_train_step": 27.541569471359253, "step": 542}
+{"train_info/time_between_train_steps": 0.005766868591308594, "step": 542}
+{"info/global_step": 543, "train_info/time_within_train_step": 27.32215929031372, "step": 543}
+{"train_info/time_between_train_steps": 0.012711763381958008, "step": 543}
+{"info/global_step": 544, "train_info/time_within_train_step": 27.57770299911499, "step": 544}
+{"train_info/time_between_train_steps": 0.010326623916625977, "step": 544}
+{"info/global_step": 545, "train_info/time_within_train_step": 27.48125457763672, "step": 545}
+{"train_info/time_between_train_steps": 0.008662223815917969, "step": 545}
+{"info/global_step": 546, "train_info/time_within_train_step": 27.48108673095703, "step": 546}
+{"train_info/time_between_train_steps": 0.005597829818725586, "step": 546}
+{"info/global_step": 547, "train_info/time_within_train_step": 27.40508723258972, "step": 547}
+{"train_info/time_between_train_steps": 0.010978221893310547, "step": 547}
+{"info/global_step": 548, "train_info/time_within_train_step": 27.32008123397827, "step": 548}
+{"train_info/time_between_train_steps": 0.0053255558013916016, "step": 548}
+{"info/global_step": 549, "train_info/time_within_train_step": 27.306559562683105, "step": 549}
+{"train_info/time_between_train_steps": 0.005322933197021484, "step": 549}
+{"info/global_step": 550, "train_info/time_within_train_step": 27.261659622192383, "step": 550}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734027893, "_runtime": 18344}, "step": 550}
+{"logs": {"train/loss": 4.2409, "train/learning_rate": 0.0003611111111111111, "train/epoch": 20.01, "_timestamp": 1734027893, "_runtime": 18344}, "step": 550}
+{"train_info/time_between_train_steps": 0.007494211196899414, "step": 550}
+{"info/global_step": 551, "train_info/time_within_train_step": 27.284637689590454, "step": 551}
+{"train_info/time_between_train_steps": 0.005273103713989258, "step": 551}
+{"info/global_step": 552, "train_info/time_within_train_step": 27.29991126060486, "step": 552}
+{"train_info/time_between_train_steps": 0.005403757095336914, "step": 552}
+{"info/global_step": 553, "train_info/time_within_train_step": 27.220021963119507, "step": 553}
+{"train_info/time_between_train_steps": 0.00520777702331543, "step": 553}
+{"info/global_step": 554, "train_info/time_within_train_step": 27.298094272613525, "step": 554}
+{"train_info/time_between_train_steps": 0.0054514408111572266, "step": 554}
+{"info/global_step": 555, "train_info/time_within_train_step": 27.237106561660767, "step": 555}
+{"train_info/time_between_train_steps": 0.010065555572509766, "step": 555}
+{"info/global_step": 556, "train_info/time_within_train_step": 27.596240282058716, "step": 556}
+{"train_info/time_between_train_steps": 0.011492252349853516, "step": 556}
+{"info/global_step": 557, "train_info/time_within_train_step": 27.30705451965332, "step": 557}
+{"train_info/time_between_train_steps": 0.0055141448974609375, "step": 557}
+{"info/global_step": 558, "train_info/time_within_train_step": 27.31126308441162, "step": 558}
+{"train_info/time_between_train_steps": 0.00547337532043457, "step": 558}
+{"info/global_step": 559, "train_info/time_within_train_step": 27.229405641555786, "step": 559}
+{"train_info/time_between_train_steps": 0.012046575546264648, "step": 559}
+{"info/global_step": 560, "train_info/time_within_train_step": 27.3812575340271, "step": 560}
+{"train_info/time_between_train_steps": 0.005499362945556641, "step": 560}
+{"info/global_step": 561, "train_info/time_within_train_step": 27.27327871322632, "step": 561}
+{"train_info/time_between_train_steps": 0.005834817886352539, "step": 561}
+{"info/global_step": 562, "train_info/time_within_train_step": 27.339391946792603, "step": 562}
+{"train_info/time_between_train_steps": 0.005608320236206055, "step": 562}
+{"info/global_step": 563, "train_info/time_within_train_step": 27.380497455596924, "step": 563}
+{"train_info/time_between_train_steps": 0.010220527648925781, "step": 563}
+{"info/global_step": 564, "train_info/time_within_train_step": 27.428683757781982, "step": 564}
+{"train_info/time_between_train_steps": 0.010589361190795898, "step": 564}
+{"info/global_step": 565, "train_info/time_within_train_step": 27.434274435043335, "step": 565}
+{"train_info/time_between_train_steps": 0.005951642990112305, "step": 565}
+{"info/global_step": 566, "train_info/time_within_train_step": 27.36767864227295, "step": 566}
+{"train_info/time_between_train_steps": 0.005780220031738281, "step": 566}
+{"info/global_step": 567, "train_info/time_within_train_step": 27.379379749298096, "step": 567}
+{"train_info/time_between_train_steps": 0.009985685348510742, "step": 567}
+{"train_info/time_between_train_steps": 13.122570753097534, "step": 567}
+{"info/global_step": 568, "train_info/time_within_train_step": 27.321082592010498, "step": 568}
+{"train_info/time_between_train_steps": 0.005839824676513672, "step": 568}
+{"info/global_step": 569, "train_info/time_within_train_step": 27.43750238418579, "step": 569}
+{"train_info/time_between_train_steps": 0.005573272705078125, "step": 569}
+{"info/global_step": 570, "train_info/time_within_train_step": 27.265647888183594, "step": 570}
+{"train_info/time_between_train_steps": 0.005445718765258789, "step": 570}
+{"info/global_step": 571, "train_info/time_within_train_step": 27.63412857055664, "step": 571}
+{"train_info/time_between_train_steps": 0.012450456619262695, "step": 571}
+{"info/global_step": 572, "train_info/time_within_train_step": 27.50693130493164, "step": 572}
+{"train_info/time_between_train_steps": 0.005991697311401367, "step": 572}
+{"info/global_step": 573, "train_info/time_within_train_step": 27.51935052871704, "step": 573}
+{"train_info/time_between_train_steps": 0.012131690979003906, "step": 573}
+{"info/global_step": 574, "train_info/time_within_train_step": 27.2771418094635, "step": 574}
+{"train_info/time_between_train_steps": 0.013155460357666016, "step": 574}
+{"info/global_step": 575, "train_info/time_within_train_step": 27.63357925415039, "step": 575}
+{"train_info/time_between_train_steps": 0.013143062591552734, "step": 575}
+{"info/global_step": 576, "train_info/time_within_train_step": 27.489830493927002, "step": 576}
+{"train_info/time_between_train_steps": 0.010477542877197266, "step": 576}
+{"info/global_step": 577, "train_info/time_within_train_step": 27.454821825027466, "step": 577}
+{"train_info/time_between_train_steps": 0.005495548248291016, "step": 577}
+{"info/global_step": 578, "train_info/time_within_train_step": 27.358912706375122, "step": 578}
+{"train_info/time_between_train_steps": 0.005455732345581055, "step": 578}
+{"info/global_step": 579, "train_info/time_within_train_step": 27.41166925430298, "step": 579}
+{"train_info/time_between_train_steps": 0.005345582962036133, "step": 579}
+{"info/global_step": 580, "train_info/time_within_train_step": 27.24946928024292, "step": 580}
+{"train_info/time_between_train_steps": 0.005270242691040039, "step": 580}
+{"info/global_step": 581, "train_info/time_within_train_step": 27.25812554359436, "step": 581}
+{"train_info/time_between_train_steps": 0.0052852630615234375, "step": 581}
+{"info/global_step": 582, "train_info/time_within_train_step": 27.24450993537903, "step": 582}
+{"train_info/time_between_train_steps": 0.0054209232330322266, "step": 582}
+{"info/global_step": 583, "train_info/time_within_train_step": 27.30336356163025, "step": 583}
+{"train_info/time_between_train_steps": 0.005177497863769531, "step": 583}
+{"info/global_step": 584, "train_info/time_within_train_step": 27.23214554786682, "step": 584}
+{"train_info/time_between_train_steps": 0.005290985107421875, "step": 584}
+{"info/global_step": 585, "train_info/time_within_train_step": 27.35131287574768, "step": 585}
+{"train_info/time_between_train_steps": 0.010186433792114258, "step": 585}
+{"info/global_step": 586, "train_info/time_within_train_step": 27.378988027572632, "step": 586}
+{"train_info/time_between_train_steps": 0.008596658706665039, "step": 586}
+{"info/global_step": 587, "train_info/time_within_train_step": 27.56644606590271, "step": 587}
+{"train_info/time_between_train_steps": 0.008815288543701172, "step": 587}
+{"info/global_step": 588, "train_info/time_within_train_step": 27.364354848861694, "step": 588}
+{"train_info/time_between_train_steps": 0.0051593780517578125, "step": 588}
+{"info/global_step": 589, "train_info/time_within_train_step": 27.292964935302734, "step": 589}
+{"train_info/time_between_train_steps": 0.005358457565307617, "step": 589}
+{"info/global_step": 590, "train_info/time_within_train_step": 27.314754724502563, "step": 590}
+{"train_info/time_between_train_steps": 0.005399942398071289, "step": 590}
+{"info/global_step": 591, "train_info/time_within_train_step": 27.3366916179657, "step": 591}
+{"train_info/time_between_train_steps": 0.006475925445556641, "step": 591}
+{"info/global_step": 592, "train_info/time_within_train_step": 27.364238023757935, "step": 592}
+{"train_info/time_between_train_steps": 0.005440235137939453, "step": 592}
+{"info/global_step": 593, "train_info/time_within_train_step": 27.287744998931885, "step": 593}
+{"train_info/time_between_train_steps": 0.005569934844970703, "step": 593}
+{"info/global_step": 594, "train_info/time_within_train_step": 27.55386519432068, "step": 594}
+{"train_info/time_between_train_steps": 0.010159015655517578, "step": 594}
+{"train_info/time_between_train_steps": 13.175945520401001, "step": 594}
+{"info/global_step": 595, "train_info/time_within_train_step": 27.401276350021362, "step": 595}
+{"train_info/time_between_train_steps": 0.005108833312988281, "step": 595}
+{"info/global_step": 596, "train_info/time_within_train_step": 27.49612259864807, "step": 596}
+{"train_info/time_between_train_steps": 0.0054836273193359375, "step": 596}
+{"info/global_step": 597, "train_info/time_within_train_step": 27.467018127441406, "step": 597}
+{"train_info/time_between_train_steps": 0.009951114654541016, "step": 597}
+{"info/global_step": 598, "train_info/time_within_train_step": 27.645838260650635, "step": 598}
+{"train_info/time_between_train_steps": 0.005528450012207031, "step": 598}
+{"info/global_step": 599, "train_info/time_within_train_step": 27.42607593536377, "step": 599}
+{"train_info/time_between_train_steps": 0.010416269302368164, "step": 599}
+{"info/global_step": 600, "train_info/time_within_train_step": 27.577667951583862, "step": 600}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734029289, "_runtime": 19740}, "step": 600}
+{"logs": {"train/loss": 4.1107, "train/learning_rate": 0.0003333333333333333, "train/epoch": 22.0, "_timestamp": 1734029289, "_runtime": 19740}, "step": 600}
+{"train_info/time_between_train_steps": 2.380500078201294, "step": 600}
+{"info/global_step": 601, "train_info/time_within_train_step": 27.382681369781494, "step": 601}
+{"train_info/time_between_train_steps": 0.005861997604370117, "step": 601}
+{"info/global_step": 602, "train_info/time_within_train_step": 27.49170708656311, "step": 602}
+{"train_info/time_between_train_steps": 0.0053708553314208984, "step": 602}
+{"info/global_step": 603, "train_info/time_within_train_step": 27.221187353134155, "step": 603}
+{"train_info/time_between_train_steps": 0.005278825759887695, "step": 603}
+{"info/global_step": 604, "train_info/time_within_train_step": 27.225780963897705, "step": 604}
+{"train_info/time_between_train_steps": 0.005360841751098633, "step": 604}
+{"info/global_step": 605, "train_info/time_within_train_step": 27.22366690635681, "step": 605}
+{"train_info/time_between_train_steps": 0.005347013473510742, "step": 605}
+{"info/global_step": 606, "train_info/time_within_train_step": 27.39559769630432, "step": 606}
+{"train_info/time_between_train_steps": 0.008570432662963867, "step": 606}
+{"info/global_step": 607, "train_info/time_within_train_step": 27.343212366104126, "step": 607}
+{"train_info/time_between_train_steps": 0.00531315803527832, "step": 607}
+{"info/global_step": 608, "train_info/time_within_train_step": 27.296834707260132, "step": 608}
+{"train_info/time_between_train_steps": 0.005140781402587891, "step": 608}
+{"info/global_step": 609, "train_info/time_within_train_step": 27.20853304862976, "step": 609}
+{"train_info/time_between_train_steps": 0.005391597747802734, "step": 609}
+{"info/global_step": 610, "train_info/time_within_train_step": 27.339421272277832, "step": 610}
+{"train_info/time_between_train_steps": 0.01004934310913086, "step": 610}
+{"info/global_step": 611, "train_info/time_within_train_step": 27.391316413879395, "step": 611}
+{"train_info/time_between_train_steps": 0.010023355484008789, "step": 611}
+{"info/global_step": 612, "train_info/time_within_train_step": 27.510843992233276, "step": 612}
+{"train_info/time_between_train_steps": 0.0052721500396728516, "step": 612}
+{"info/global_step": 613, "train_info/time_within_train_step": 27.398359060287476, "step": 613}
+{"train_info/time_between_train_steps": 0.005419015884399414, "step": 613}
+{"info/global_step": 614, "train_info/time_within_train_step": 27.32690739631653, "step": 614}
+{"train_info/time_between_train_steps": 0.008224248886108398, "step": 614}
+{"info/global_step": 615, "train_info/time_within_train_step": 27.314568281173706, "step": 615}
+{"train_info/time_between_train_steps": 0.005427837371826172, "step": 615}
+{"info/global_step": 616, "train_info/time_within_train_step": 27.46548581123352, "step": 616}
+{"train_info/time_between_train_steps": 0.014750003814697266, "step": 616}
+{"info/global_step": 617, "train_info/time_within_train_step": 27.511061429977417, "step": 617}
+{"train_info/time_between_train_steps": 0.009661436080932617, "step": 617}
+{"info/global_step": 618, "train_info/time_within_train_step": 27.572118520736694, "step": 618}
+{"train_info/time_between_train_steps": 0.005337238311767578, "step": 618}
+{"info/global_step": 619, "train_info/time_within_train_step": 27.471014499664307, "step": 619}
+{"train_info/time_between_train_steps": 0.01921367645263672, "step": 619}
+{"info/global_step": 620, "train_info/time_within_train_step": 27.338131427764893, "step": 620}
+{"train_info/time_between_train_steps": 0.00536799430847168, "step": 620}
+{"info/global_step": 621, "train_info/time_within_train_step": 27.459688901901245, "step": 621}
+{"train_info/time_between_train_steps": 0.006602287292480469, "step": 621}
+{"train_info/time_between_train_steps": 12.826379299163818, "step": 621}
+{"info/global_step": 622, "train_info/time_within_train_step": 27.39739179611206, "step": 622}
+{"train_info/time_between_train_steps": 0.0057184696197509766, "step": 622}
+{"info/global_step": 623, "train_info/time_within_train_step": 27.371052980422974, "step": 623}
+{"train_info/time_between_train_steps": 0.005491971969604492, "step": 623}
+{"info/global_step": 624, "train_info/time_within_train_step": 27.315165042877197, "step": 624}
+{"train_info/time_between_train_steps": 0.00572657585144043, "step": 624}
+{"info/global_step": 625, "train_info/time_within_train_step": 27.505356073379517, "step": 625}
+{"train_info/time_between_train_steps": 0.00532078742980957, "step": 625}
+{"info/global_step": 626, "train_info/time_within_train_step": 27.241090774536133, "step": 626}
+{"train_info/time_between_train_steps": 0.005780935287475586, "step": 626}
+{"info/global_step": 627, "train_info/time_within_train_step": 27.42228674888611, "step": 627}
+{"train_info/time_between_train_steps": 0.005445718765258789, "step": 627}
+{"info/global_step": 628, "train_info/time_within_train_step": 27.241950511932373, "step": 628}
+{"train_info/time_between_train_steps": 0.005793571472167969, "step": 628}
+{"info/global_step": 629, "train_info/time_within_train_step": 27.283588647842407, "step": 629}
+{"train_info/time_between_train_steps": 0.005172252655029297, "step": 629}
+{"info/global_step": 630, "train_info/time_within_train_step": 27.224342346191406, "step": 630}
+{"train_info/time_between_train_steps": 0.011350154876708984, "step": 630}
+{"info/global_step": 631, "train_info/time_within_train_step": 27.38411831855774, "step": 631}
+{"train_info/time_between_train_steps": 0.0051729679107666016, "step": 631}
+{"info/global_step": 632, "train_info/time_within_train_step": 27.2655029296875, "step": 632}
+{"train_info/time_between_train_steps": 0.010348796844482422, "step": 632}
+{"info/global_step": 633, "train_info/time_within_train_step": 27.33819580078125, "step": 633}
+{"train_info/time_between_train_steps": 0.0053517818450927734, "step": 633}
+{"info/global_step": 634, "train_info/time_within_train_step": 27.41888737678528, "step": 634}
+{"train_info/time_between_train_steps": 0.009337425231933594, "step": 634}
+{"info/global_step": 635, "train_info/time_within_train_step": 27.567933082580566, "step": 635}
+{"train_info/time_between_train_steps": 0.009780168533325195, "step": 635}
+{"info/global_step": 636, "train_info/time_within_train_step": 27.3270845413208, "step": 636}
+{"train_info/time_between_train_steps": 0.008818626403808594, "step": 636}
+{"info/global_step": 637, "train_info/time_within_train_step": 27.361352682113647, "step": 637}
+{"train_info/time_between_train_steps": 0.008743762969970703, "step": 637}
+{"info/global_step": 638, "train_info/time_within_train_step": 27.30939769744873, "step": 638}
+{"train_info/time_between_train_steps": 0.005233287811279297, "step": 638}
+{"info/global_step": 639, "train_info/time_within_train_step": 27.26723575592041, "step": 639}
+{"train_info/time_between_train_steps": 0.005857944488525391, "step": 639}
+{"info/global_step": 640, "train_info/time_within_train_step": 27.283311367034912, "step": 640}
+{"train_info/time_between_train_steps": 0.01018381118774414, "step": 640}
+{"info/global_step": 641, "train_info/time_within_train_step": 27.33807682991028, "step": 641}
+{"train_info/time_between_train_steps": 0.0053822994232177734, "step": 641}
+{"info/global_step": 642, "train_info/time_within_train_step": 27.287174463272095, "step": 642}
+{"train_info/time_between_train_steps": 0.0054590702056884766, "step": 642}
+{"info/global_step": 643, "train_info/time_within_train_step": 27.286043167114258, "step": 643}
+{"train_info/time_between_train_steps": 0.006417989730834961, "step": 643}
+{"info/global_step": 644, "train_info/time_within_train_step": 27.579944610595703, "step": 644}
+{"train_info/time_between_train_steps": 0.009999275207519531, "step": 644}
+{"info/global_step": 645, "train_info/time_within_train_step": 27.40441918373108, "step": 645}
+{"train_info/time_between_train_steps": 0.015009403228759766, "step": 645}
+{"info/global_step": 646, "train_info/time_within_train_step": 27.31067705154419, "step": 646}
+{"train_info/time_between_train_steps": 0.0057294368743896484, "step": 646}
+{"info/global_step": 647, "train_info/time_within_train_step": 27.23369860649109, "step": 647}
+{"train_info/time_between_train_steps": 0.005627155303955078, "step": 647}
+{"info/global_step": 648, "train_info/time_within_train_step": 27.396483421325684, "step": 648}
+{"train_info/time_between_train_steps": 0.006153583526611328, "step": 648}
+{"train_info/time_between_train_steps": 12.9098961353302, "step": 648}
+{"info/global_step": 649, "train_info/time_within_train_step": 27.3397855758667, "step": 649}
+{"train_info/time_between_train_steps": 0.005721092224121094, "step": 649}
+{"info/global_step": 650, "train_info/time_within_train_step": 27.41430401802063, "step": 650}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734030686, "_runtime": 21137}, "step": 650}
+{"logs": {"train/loss": 3.9943, "train/learning_rate": 0.00030555555555555555, "train/epoch": 24.0, "_timestamp": 1734030686, "_runtime": 21137}, "step": 650}
+{"train_info/time_between_train_steps": 0.007317781448364258, "step": 650}
+{"info/global_step": 651, "train_info/time_within_train_step": 27.36816096305847, "step": 651}
+{"train_info/time_between_train_steps": 0.01648259162902832, "step": 651}
+{"info/global_step": 652, "train_info/time_within_train_step": 27.4704806804657, "step": 652}
+{"train_info/time_between_train_steps": 0.005456447601318359, "step": 652}
+{"info/global_step": 653, "train_info/time_within_train_step": 27.407975673675537, "step": 653}
+{"train_info/time_between_train_steps": 0.005881547927856445, "step": 653}
+{"info/global_step": 654, "train_info/time_within_train_step": 27.590009689331055, "step": 654}
+{"train_info/time_between_train_steps": 0.005507469177246094, "step": 654}
+{"info/global_step": 655, "train_info/time_within_train_step": 27.34368586540222, "step": 655}
+{"train_info/time_between_train_steps": 0.005713462829589844, "step": 655}
+{"info/global_step": 656, "train_info/time_within_train_step": 27.489287614822388, "step": 656}
+{"train_info/time_between_train_steps": 0.005331993103027344, "step": 656}
+{"info/global_step": 657, "train_info/time_within_train_step": 27.20800232887268, "step": 657}
+{"train_info/time_between_train_steps": 0.005242347717285156, "step": 657}
+{"info/global_step": 658, "train_info/time_within_train_step": 27.37728786468506, "step": 658}
+{"train_info/time_between_train_steps": 0.006761312484741211, "step": 658}
+{"info/global_step": 659, "train_info/time_within_train_step": 27.390652656555176, "step": 659}
+{"train_info/time_between_train_steps": 0.00996851921081543, "step": 659}
+{"info/global_step": 660, "train_info/time_within_train_step": 27.333547115325928, "step": 660}
+{"train_info/time_between_train_steps": 0.0055141448974609375, "step": 660}
+{"info/global_step": 661, "train_info/time_within_train_step": 27.324825048446655, "step": 661}
+{"train_info/time_between_train_steps": 0.005283832550048828, "step": 661}
+{"info/global_step": 662, "train_info/time_within_train_step": 27.360974073410034, "step": 662}
+{"train_info/time_between_train_steps": 0.0054552555084228516, "step": 662}
+{"info/global_step": 663, "train_info/time_within_train_step": 27.33496904373169, "step": 663}
+{"train_info/time_between_train_steps": 0.005480289459228516, "step": 663}
+{"info/global_step": 664, "train_info/time_within_train_step": 27.413931369781494, "step": 664}
+{"train_info/time_between_train_steps": 0.0052525997161865234, "step": 664}
+{"info/global_step": 665, "train_info/time_within_train_step": 27.2859365940094, "step": 665}
+{"train_info/time_between_train_steps": 0.005336284637451172, "step": 665}
+{"info/global_step": 666, "train_info/time_within_train_step": 27.380587339401245, "step": 666}
+{"train_info/time_between_train_steps": 0.005499839782714844, "step": 666}
+{"info/global_step": 667, "train_info/time_within_train_step": 27.33383846282959, "step": 667}
+{"train_info/time_between_train_steps": 0.00961446762084961, "step": 667}
+{"info/global_step": 668, "train_info/time_within_train_step": 27.39219331741333, "step": 668}
+{"train_info/time_between_train_steps": 0.005327463150024414, "step": 668}
+{"info/global_step": 669, "train_info/time_within_train_step": 27.463061571121216, "step": 669}
+{"train_info/time_between_train_steps": 0.014713048934936523, "step": 669}
+{"info/global_step": 670, "train_info/time_within_train_step": 27.457165479660034, "step": 670}
+{"train_info/time_between_train_steps": 0.00861668586730957, "step": 670}
+{"info/global_step": 671, "train_info/time_within_train_step": 27.381406545639038, "step": 671}
+{"train_info/time_between_train_steps": 0.005346536636352539, "step": 671}
+{"info/global_step": 672, "train_info/time_within_train_step": 27.25373673439026, "step": 672}
+{"train_info/time_between_train_steps": 0.005353450775146484, "step": 672}
+{"info/global_step": 673, "train_info/time_within_train_step": 27.325409412384033, "step": 673}
+{"train_info/time_between_train_steps": 0.005636930465698242, "step": 673}
+{"info/global_step": 674, "train_info/time_within_train_step": 27.29370903968811, "step": 674}
+{"train_info/time_between_train_steps": 0.012260675430297852, "step": 674}
+{"info/global_step": 675, "train_info/time_within_train_step": 27.313000679016113, "step": 675}
+{"train_info/time_between_train_steps": 0.006573677062988281, "step": 675}
+{"train_info/time_between_train_steps": 13.128698825836182, "step": 675}
+{"info/global_step": 676, "train_info/time_within_train_step": 27.254109859466553, "step": 676}
+{"train_info/time_between_train_steps": 0.00541996955871582, "step": 676}
+{"info/global_step": 677, "train_info/time_within_train_step": 27.515980005264282, "step": 677}
+{"train_info/time_between_train_steps": 0.005309343338012695, "step": 677}
+{"info/global_step": 678, "train_info/time_within_train_step": 27.30360507965088, "step": 678}
+{"train_info/time_between_train_steps": 0.008872270584106445, "step": 678}
+{"info/global_step": 679, "train_info/time_within_train_step": 27.561785221099854, "step": 679}
+{"train_info/time_between_train_steps": 0.010530948638916016, "step": 679}
+{"info/global_step": 680, "train_info/time_within_train_step": 27.39607048034668, "step": 680}
+{"train_info/time_between_train_steps": 0.010610342025756836, "step": 680}
+{"info/global_step": 681, "train_info/time_within_train_step": 27.614644765853882, "step": 681}
+{"train_info/time_between_train_steps": 0.0056231021881103516, "step": 681}
+{"info/global_step": 682, "train_info/time_within_train_step": 27.462139129638672, "step": 682}
+{"train_info/time_between_train_steps": 0.005319118499755859, "step": 682}
+{"info/global_step": 683, "train_info/time_within_train_step": 27.329195022583008, "step": 683}
+{"train_info/time_between_train_steps": 0.005245685577392578, "step": 683}
+{"info/global_step": 684, "train_info/time_within_train_step": 27.335938453674316, "step": 684}
+{"train_info/time_between_train_steps": 0.005257368087768555, "step": 684}
+{"info/global_step": 685, "train_info/time_within_train_step": 27.32485294342041, "step": 685}
+{"train_info/time_between_train_steps": 0.00864267349243164, "step": 685}
+{"info/global_step": 686, "train_info/time_within_train_step": 27.357212781906128, "step": 686}
+{"train_info/time_between_train_steps": 0.005140066146850586, "step": 686}
+{"info/global_step": 687, "train_info/time_within_train_step": 27.420847415924072, "step": 687}
+{"train_info/time_between_train_steps": 0.0123138427734375, "step": 687}
+{"info/global_step": 688, "train_info/time_within_train_step": 27.293599605560303, "step": 688}
+{"train_info/time_between_train_steps": 0.005101442337036133, "step": 688}
+{"info/global_step": 689, "train_info/time_within_train_step": 27.414894342422485, "step": 689}
+{"train_info/time_between_train_steps": 0.005120038986206055, "step": 689}
+{"info/global_step": 690, "train_info/time_within_train_step": 27.26794958114624, "step": 690}
+{"train_info/time_between_train_steps": 0.010641336441040039, "step": 690}
+{"info/global_step": 691, "train_info/time_within_train_step": 27.259514808654785, "step": 691}
+{"train_info/time_between_train_steps": 0.005298137664794922, "step": 691}
+{"info/global_step": 692, "train_info/time_within_train_step": 27.32599115371704, "step": 692}
+{"train_info/time_between_train_steps": 0.005203723907470703, "step": 692}
+{"info/global_step": 693, "train_info/time_within_train_step": 27.21751832962036, "step": 693}
+{"train_info/time_between_train_steps": 0.005133867263793945, "step": 693}
+{"info/global_step": 694, "train_info/time_within_train_step": 27.228198766708374, "step": 694}
+{"train_info/time_between_train_steps": 0.00517582893371582, "step": 694}
+{"info/global_step": 695, "train_info/time_within_train_step": 27.371982097625732, "step": 695}
+{"train_info/time_between_train_steps": 0.0052187442779541016, "step": 695}
+{"info/global_step": 696, "train_info/time_within_train_step": 27.369737148284912, "step": 696}
+{"train_info/time_between_train_steps": 0.009438514709472656, "step": 696}
+{"info/global_step": 697, "train_info/time_within_train_step": 27.354224681854248, "step": 697}
+{"train_info/time_between_train_steps": 0.005151987075805664, "step": 697}
+{"info/global_step": 698, "train_info/time_within_train_step": 27.222692012786865, "step": 698}
+{"train_info/time_between_train_steps": 0.005265712738037109, "step": 698}
+{"info/global_step": 699, "train_info/time_within_train_step": 27.33707594871521, "step": 699}
+{"train_info/time_between_train_steps": 0.0053408145904541016, "step": 699}
+{"info/global_step": 700, "train_info/time_within_train_step": 27.31895637512207, "step": 700}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734032067, "_runtime": 22518}, "step": 700}
+{"logs": {"train/loss": 3.864, "train/learning_rate": 0.0002777777777777778, "train/epoch": 25.02, "_timestamp": 1734032067, "_runtime": 22518}, "step": 700}
+{"train_info/time_between_train_steps": 2.290334463119507, "step": 700}
+{"info/global_step": 701, "train_info/time_within_train_step": 27.41163468360901, "step": 701}
+{"train_info/time_between_train_steps": 0.00930929183959961, "step": 701}
+{"info/global_step": 702, "train_info/time_within_train_step": 27.391586780548096, "step": 702}
+{"train_info/time_between_train_steps": 0.0059659481048583984, "step": 702}
+{"train_info/time_between_train_steps": 12.743213415145874, "step": 702}
+{"info/global_step": 703, "train_info/time_within_train_step": 27.370062589645386, "step": 703}
+{"train_info/time_between_train_steps": 0.005307674407958984, "step": 703}
+{"info/global_step": 704, "train_info/time_within_train_step": 27.492971897125244, "step": 704}
+{"train_info/time_between_train_steps": 0.005156278610229492, "step": 704}
+{"info/global_step": 705, "train_info/time_within_train_step": 27.460604667663574, "step": 705}
+{"train_info/time_between_train_steps": 0.0053517818450927734, "step": 705}
+{"info/global_step": 706, "train_info/time_within_train_step": 27.599412441253662, "step": 706}
+{"train_info/time_between_train_steps": 0.005273103713989258, "step": 706}
+{"info/global_step": 707, "train_info/time_within_train_step": 27.46350359916687, "step": 707}
+{"train_info/time_between_train_steps": 0.0055196285247802734, "step": 707}
+{"info/global_step": 708, "train_info/time_within_train_step": 27.642226934432983, "step": 708}
+{"train_info/time_between_train_steps": 0.010130882263183594, "step": 708}
+{"info/global_step": 709, "train_info/time_within_train_step": 27.362056016921997, "step": 709}
+{"train_info/time_between_train_steps": 0.005404949188232422, "step": 709}
+{"info/global_step": 710, "train_info/time_within_train_step": 27.426843643188477, "step": 710}
+{"train_info/time_between_train_steps": 0.005008697509765625, "step": 710}
+{"info/global_step": 711, "train_info/time_within_train_step": 27.360280752182007, "step": 711}
+{"train_info/time_between_train_steps": 0.009104728698730469, "step": 711}
+{"info/global_step": 712, "train_info/time_within_train_step": 27.298261165618896, "step": 712}
+{"train_info/time_between_train_steps": 0.005121707916259766, "step": 712}
+{"info/global_step": 713, "train_info/time_within_train_step": 27.237961053848267, "step": 713}
+{"train_info/time_between_train_steps": 0.005269289016723633, "step": 713}
+{"info/global_step": 714, "train_info/time_within_train_step": 27.213837385177612, "step": 714}
+{"train_info/time_between_train_steps": 0.005094289779663086, "step": 714}
+{"info/global_step": 715, "train_info/time_within_train_step": 27.20363426208496, "step": 715}
+{"train_info/time_between_train_steps": 0.0050661563873291016, "step": 715}
+{"info/global_step": 716, "train_info/time_within_train_step": 27.44127345085144, "step": 716}
+{"train_info/time_between_train_steps": 0.0050623416900634766, "step": 716}
+{"info/global_step": 717, "train_info/time_within_train_step": 27.320155382156372, "step": 717}
+{"train_info/time_between_train_steps": 0.005219221115112305, "step": 717}
+{"info/global_step": 718, "train_info/time_within_train_step": 27.265223264694214, "step": 718}
+{"train_info/time_between_train_steps": 0.009703874588012695, "step": 718}
+{"info/global_step": 719, "train_info/time_within_train_step": 27.340960264205933, "step": 719}
+{"train_info/time_between_train_steps": 0.0053195953369140625, "step": 719}
+{"info/global_step": 720, "train_info/time_within_train_step": 27.31173276901245, "step": 720}
+{"train_info/time_between_train_steps": 0.005269765853881836, "step": 720}
+{"info/global_step": 721, "train_info/time_within_train_step": 27.379525661468506, "step": 721}
+{"train_info/time_between_train_steps": 0.009811639785766602, "step": 721}
+{"info/global_step": 722, "train_info/time_within_train_step": 27.299065589904785, "step": 722}
+{"train_info/time_between_train_steps": 0.0052530765533447266, "step": 722}
+{"info/global_step": 723, "train_info/time_within_train_step": 27.33756160736084, "step": 723}
+{"train_info/time_between_train_steps": 0.005502223968505859, "step": 723}
+{"info/global_step": 724, "train_info/time_within_train_step": 27.32785725593567, "step": 724}
+{"train_info/time_between_train_steps": 0.005352020263671875, "step": 724}
+{"info/global_step": 725, "train_info/time_within_train_step": 27.326956033706665, "step": 725}
+{"train_info/time_between_train_steps": 0.005349636077880859, "step": 725}
+{"info/global_step": 726, "train_info/time_within_train_step": 27.536694765090942, "step": 726}
+{"train_info/time_between_train_steps": 0.005447864532470703, "step": 726}
+{"info/global_step": 727, "train_info/time_within_train_step": 27.378309726715088, "step": 727}
+{"train_info/time_between_train_steps": 0.005433797836303711, "step": 727}
+{"info/global_step": 728, "train_info/time_within_train_step": 27.35113549232483, "step": 728}
+{"train_info/time_between_train_steps": 0.005570888519287109, "step": 728}
+{"info/global_step": 729, "train_info/time_within_train_step": 27.564569234848022, "step": 729}
+{"train_info/time_between_train_steps": 0.010181427001953125, "step": 729}
+{"train_info/time_between_train_steps": 12.918586015701294, "step": 729}
+{"info/global_step": 730, "train_info/time_within_train_step": 27.402984619140625, "step": 730}
+{"train_info/time_between_train_steps": 0.005796670913696289, "step": 730}
+{"info/global_step": 731, "train_info/time_within_train_step": 27.585786819458008, "step": 731}
+{"train_info/time_between_train_steps": 0.005591630935668945, "step": 731}
+{"info/global_step": 732, "train_info/time_within_train_step": 27.42587685585022, "step": 732}
+{"train_info/time_between_train_steps": 0.009938955307006836, "step": 732}
+{"info/global_step": 733, "train_info/time_within_train_step": 27.47019910812378, "step": 733}
+{"train_info/time_between_train_steps": 0.005994319915771484, "step": 733}
+{"info/global_step": 734, "train_info/time_within_train_step": 27.389458417892456, "step": 734}
+{"train_info/time_between_train_steps": 0.0059967041015625, "step": 734}
+{"info/global_step": 735, "train_info/time_within_train_step": 27.470496892929077, "step": 735}
+{"train_info/time_between_train_steps": 0.005533695220947266, "step": 735}
+{"info/global_step": 736, "train_info/time_within_train_step": 27.258125066757202, "step": 736}
+{"train_info/time_between_train_steps": 0.005747795104980469, "step": 736}
+{"info/global_step": 737, "train_info/time_within_train_step": 27.55409526824951, "step": 737}
+{"train_info/time_between_train_steps": 0.005193948745727539, "step": 737}
+{"info/global_step": 738, "train_info/time_within_train_step": 27.419282913208008, "step": 738}
+{"train_info/time_between_train_steps": 0.005364894866943359, "step": 738}
+{"info/global_step": 739, "train_info/time_within_train_step": 27.482577562332153, "step": 739}
+{"train_info/time_between_train_steps": 0.008713722229003906, "step": 739}
+{"info/global_step": 740, "train_info/time_within_train_step": 27.40595269203186, "step": 740}
+{"train_info/time_between_train_steps": 0.01485300064086914, "step": 740}
+{"info/global_step": 741, "train_info/time_within_train_step": 27.495439767837524, "step": 741}
+{"train_info/time_between_train_steps": 0.00756525993347168, "step": 741}
+{"info/global_step": 742, "train_info/time_within_train_step": 27.41529417037964, "step": 742}
+{"train_info/time_between_train_steps": 0.008656740188598633, "step": 742}
+{"info/global_step": 743, "train_info/time_within_train_step": 27.308621406555176, "step": 743}
+{"train_info/time_between_train_steps": 0.005484819412231445, "step": 743}
+{"info/global_step": 744, "train_info/time_within_train_step": 27.29612898826599, "step": 744}
+{"train_info/time_between_train_steps": 0.005994081497192383, "step": 744}
+{"info/global_step": 745, "train_info/time_within_train_step": 27.210073232650757, "step": 745}
+{"train_info/time_between_train_steps": 0.009123563766479492, "step": 745}
+{"info/global_step": 746, "train_info/time_within_train_step": 27.390238523483276, "step": 746}
+{"train_info/time_between_train_steps": 0.005212306976318359, "step": 746}
+{"info/global_step": 747, "train_info/time_within_train_step": 27.204756498336792, "step": 747}
+{"train_info/time_between_train_steps": 0.005354404449462891, "step": 747}
+{"info/global_step": 748, "train_info/time_within_train_step": 27.220250129699707, "step": 748}
+{"train_info/time_between_train_steps": 0.005240678787231445, "step": 748}
+{"info/global_step": 749, "train_info/time_within_train_step": 27.37521529197693, "step": 749}
+{"train_info/time_between_train_steps": 0.005205392837524414, "step": 749}
+{"info/global_step": 750, "train_info/time_within_train_step": 27.44611930847168, "step": 750}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734033465, "_runtime": 23916}, "step": 750}
+{"logs": {"train/loss": 3.805, "train/learning_rate": 0.00025, "train/epoch": 27.02, "_timestamp": 1734033465, "_runtime": 23916}, "step": 750}
+{"train_info/time_between_train_steps": 0.012108802795410156, "step": 750}
+{"info/global_step": 751, "train_info/time_within_train_step": 27.417672872543335, "step": 751}
+{"train_info/time_between_train_steps": 0.005381345748901367, "step": 751}
+{"info/global_step": 752, "train_info/time_within_train_step": 27.35316014289856, "step": 752}
+{"train_info/time_between_train_steps": 0.005438566207885742, "step": 752}
+{"info/global_step": 753, "train_info/time_within_train_step": 27.406396865844727, "step": 753}
+{"train_info/time_between_train_steps": 0.0054094791412353516, "step": 753}
+{"info/global_step": 754, "train_info/time_within_train_step": 27.319916009902954, "step": 754}
+{"train_info/time_between_train_steps": 0.0058956146240234375, "step": 754}
+{"info/global_step": 755, "train_info/time_within_train_step": 27.38036799430847, "step": 755}
+{"train_info/time_between_train_steps": 0.011771202087402344, "step": 755}
+{"info/global_step": 756, "train_info/time_within_train_step": 27.394099473953247, "step": 756}
+{"train_info/time_between_train_steps": 0.005845069885253906, "step": 756}
+{"train_info/time_between_train_steps": 12.94377064704895, "step": 756}
+{"info/global_step": 757, "train_info/time_within_train_step": 27.35245442390442, "step": 757}
+{"train_info/time_between_train_steps": 0.005077362060546875, "step": 757}
+{"info/global_step": 758, "train_info/time_within_train_step": 27.650957107543945, "step": 758}
+{"train_info/time_between_train_steps": 0.007019758224487305, "step": 758}
+{"info/global_step": 759, "train_info/time_within_train_step": 27.439465522766113, "step": 759}
+{"train_info/time_between_train_steps": 0.005345344543457031, "step": 759}
+{"info/global_step": 760, "train_info/time_within_train_step": 27.51729440689087, "step": 760}
+{"train_info/time_between_train_steps": 0.005250453948974609, "step": 760}
+{"info/global_step": 761, "train_info/time_within_train_step": 27.419551849365234, "step": 761}
+{"train_info/time_between_train_steps": 0.005662202835083008, "step": 761}
+{"info/global_step": 762, "train_info/time_within_train_step": 27.55822730064392, "step": 762}
+{"train_info/time_between_train_steps": 0.0059244632720947266, "step": 762}
+{"info/global_step": 763, "train_info/time_within_train_step": 27.51456308364868, "step": 763}
+{"train_info/time_between_train_steps": 0.0058062076568603516, "step": 763}
+{"info/global_step": 764, "train_info/time_within_train_step": 27.47923755645752, "step": 764}
+{"train_info/time_between_train_steps": 0.005214214324951172, "step": 764}
+{"info/global_step": 765, "train_info/time_within_train_step": 27.382482051849365, "step": 765}
+{"train_info/time_between_train_steps": 0.009030342102050781, "step": 765}
+{"info/global_step": 766, "train_info/time_within_train_step": 27.462873220443726, "step": 766}
+{"train_info/time_between_train_steps": 0.014693975448608398, "step": 766}
+{"info/global_step": 767, "train_info/time_within_train_step": 27.341803312301636, "step": 767}
+{"train_info/time_between_train_steps": 0.0053408145904541016, "step": 767}
+{"info/global_step": 768, "train_info/time_within_train_step": 27.22782325744629, "step": 768}
+{"train_info/time_between_train_steps": 0.005219459533691406, "step": 768}
+{"info/global_step": 769, "train_info/time_within_train_step": 27.241849422454834, "step": 769}
+{"train_info/time_between_train_steps": 0.007349491119384766, "step": 769}
+{"info/global_step": 770, "train_info/time_within_train_step": 27.246644973754883, "step": 770}
+{"train_info/time_between_train_steps": 0.014935970306396484, "step": 770}
+{"info/global_step": 771, "train_info/time_within_train_step": 27.480518341064453, "step": 771}
+{"train_info/time_between_train_steps": 0.005727052688598633, "step": 771}
+{"info/global_step": 772, "train_info/time_within_train_step": 27.378459692001343, "step": 772}
+{"train_info/time_between_train_steps": 0.005298614501953125, "step": 772}
+{"info/global_step": 773, "train_info/time_within_train_step": 27.210317850112915, "step": 773}
+{"train_info/time_between_train_steps": 0.005280971527099609, "step": 773}
+{"info/global_step": 774, "train_info/time_within_train_step": 27.274213314056396, "step": 774}
+{"train_info/time_between_train_steps": 0.005278348922729492, "step": 774}
+{"info/global_step": 775, "train_info/time_within_train_step": 27.4232759475708, "step": 775}
+{"train_info/time_between_train_steps": 0.0053997039794921875, "step": 775}
+{"info/global_step": 776, "train_info/time_within_train_step": 27.252904415130615, "step": 776}
+{"train_info/time_between_train_steps": 0.005232334136962891, "step": 776}
+{"info/global_step": 777, "train_info/time_within_train_step": 27.232807397842407, "step": 777}
+{"train_info/time_between_train_steps": 0.005320310592651367, "step": 777}
+{"info/global_step": 778, "train_info/time_within_train_step": 27.247549057006836, "step": 778}
+{"train_info/time_between_train_steps": 0.006442070007324219, "step": 778}
+{"info/global_step": 779, "train_info/time_within_train_step": 27.393256425857544, "step": 779}
+{"train_info/time_between_train_steps": 0.005465984344482422, "step": 779}
+{"info/global_step": 780, "train_info/time_within_train_step": 27.37333607673645, "step": 780}
+{"train_info/time_between_train_steps": 0.015623331069946289, "step": 780}
+{"info/global_step": 781, "train_info/time_within_train_step": 27.370466232299805, "step": 781}
+{"train_info/time_between_train_steps": 0.010662078857421875, "step": 781}
+{"info/global_step": 782, "train_info/time_within_train_step": 27.272751808166504, "step": 782}
+{"train_info/time_between_train_steps": 0.015439033508300781, "step": 782}
+{"info/global_step": 783, "train_info/time_within_train_step": 27.46968173980713, "step": 783}
+{"train_info/time_between_train_steps": 0.012353897094726562, "step": 783}
+{"train_info/time_between_train_steps": 12.824958801269531, "step": 783}
+{"info/global_step": 784, "train_info/time_within_train_step": 27.362204790115356, "step": 784}
+{"train_info/time_between_train_steps": 0.0057141780853271484, "step": 784}
+{"info/global_step": 785, "train_info/time_within_train_step": 27.400785207748413, "step": 785}
+{"train_info/time_between_train_steps": 0.00558781623840332, "step": 785}
+{"info/global_step": 786, "train_info/time_within_train_step": 27.276623010635376, "step": 786}
+{"train_info/time_between_train_steps": 0.005867481231689453, "step": 786}
+{"info/global_step": 787, "train_info/time_within_train_step": 27.507734060287476, "step": 787}
+{"train_info/time_between_train_steps": 0.005631446838378906, "step": 787}
+{"info/global_step": 788, "train_info/time_within_train_step": 27.245386838912964, "step": 788}
+{"train_info/time_between_train_steps": 0.005962848663330078, "step": 788}
+{"info/global_step": 789, "train_info/time_within_train_step": 27.4439218044281, "step": 789}
+{"train_info/time_between_train_steps": 0.005785226821899414, "step": 789}
+{"info/global_step": 790, "train_info/time_within_train_step": 27.350322008132935, "step": 790}
+{"train_info/time_between_train_steps": 0.005746603012084961, "step": 790}
+{"info/global_step": 791, "train_info/time_within_train_step": 27.3874351978302, "step": 791}
+{"train_info/time_between_train_steps": 0.0052373409271240234, "step": 791}
+{"info/global_step": 792, "train_info/time_within_train_step": 27.248844623565674, "step": 792}
+{"train_info/time_between_train_steps": 0.011767148971557617, "step": 792}
+{"info/global_step": 793, "train_info/time_within_train_step": 27.295939922332764, "step": 793}
+{"train_info/time_between_train_steps": 0.005305767059326172, "step": 793}
+{"info/global_step": 794, "train_info/time_within_train_step": 27.267273664474487, "step": 794}
+{"train_info/time_between_train_steps": 0.005299806594848633, "step": 794}
+{"info/global_step": 795, "train_info/time_within_train_step": 27.20364284515381, "step": 795}
+{"train_info/time_between_train_steps": 0.005251407623291016, "step": 795}
+{"info/global_step": 796, "train_info/time_within_train_step": 27.202466011047363, "step": 796}
+{"train_info/time_between_train_steps": 0.005244016647338867, "step": 796}
+{"info/global_step": 797, "train_info/time_within_train_step": 27.272371530532837, "step": 797}
+{"train_info/time_between_train_steps": 0.005372285842895508, "step": 797}
+{"info/global_step": 798, "train_info/time_within_train_step": 27.2555570602417, "step": 798}
+{"train_info/time_between_train_steps": 0.010603666305541992, "step": 798}
+{"info/global_step": 799, "train_info/time_within_train_step": 27.6742103099823, "step": 799}
+{"train_info/time_between_train_steps": 0.005322694778442383, "step": 799}
+{"info/global_step": 800, "train_info/time_within_train_step": 27.445500373840332, "step": 800}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734034860, "_runtime": 25311}, "step": 800}
+{"logs": {"train/loss": 3.726, "train/learning_rate": 0.00022222222222222218, "train/epoch": 29.01, "_timestamp": 1734034860, "_runtime": 25311}, "step": 800}
+{"train_info/time_between_train_steps": 3.028412342071533, "step": 800}
+{"info/global_step": 801, "train_info/time_within_train_step": 27.348297119140625, "step": 801}
+{"train_info/time_between_train_steps": 0.016206979751586914, "step": 801}
+{"info/global_step": 802, "train_info/time_within_train_step": 27.25723958015442, "step": 802}
+{"train_info/time_between_train_steps": 0.010007143020629883, "step": 802}
+{"info/global_step": 803, "train_info/time_within_train_step": 27.406988620758057, "step": 803}
+{"train_info/time_between_train_steps": 0.010395526885986328, "step": 803}
+{"info/global_step": 804, "train_info/time_within_train_step": 27.430042505264282, "step": 804}
+{"train_info/time_between_train_steps": 0.01178598403930664, "step": 804}
+{"info/global_step": 805, "train_info/time_within_train_step": 27.48417568206787, "step": 805}
+{"train_info/time_between_train_steps": 0.005291938781738281, "step": 805}
+{"info/global_step": 806, "train_info/time_within_train_step": 27.303809642791748, "step": 806}
+{"train_info/time_between_train_steps": 0.00555729866027832, "step": 806}
+{"info/global_step": 807, "train_info/time_within_train_step": 27.352675199508667, "step": 807}
+{"train_info/time_between_train_steps": 0.005440711975097656, "step": 807}
+{"info/global_step": 808, "train_info/time_within_train_step": 27.372366428375244, "step": 808}
+{"train_info/time_between_train_steps": 0.010373353958129883, "step": 808}
+{"info/global_step": 809, "train_info/time_within_train_step": 27.294907808303833, "step": 809}
+{"train_info/time_between_train_steps": 0.010624885559082031, "step": 809}
+{"info/global_step": 810, "train_info/time_within_train_step": 27.35106921195984, "step": 810}
+{"train_info/time_between_train_steps": 0.006235837936401367, "step": 810}
+{"train_info/time_between_train_steps": 12.817037105560303, "step": 810}
+{"info/global_step": 811, "train_info/time_within_train_step": 27.296112298965454, "step": 811}
+{"train_info/time_between_train_steps": 0.009522438049316406, "step": 811}
+{"info/global_step": 812, "train_info/time_within_train_step": 27.606049299240112, "step": 812}
+{"train_info/time_between_train_steps": 0.011993646621704102, "step": 812}
+{"info/global_step": 813, "train_info/time_within_train_step": 27.301023721694946, "step": 813}
+{"train_info/time_between_train_steps": 0.00544285774230957, "step": 813}
+{"info/global_step": 814, "train_info/time_within_train_step": 27.447033405303955, "step": 814}
+{"train_info/time_between_train_steps": 0.010808944702148438, "step": 814}
+{"info/global_step": 815, "train_info/time_within_train_step": 27.344911336898804, "step": 815}
+{"train_info/time_between_train_steps": 0.005511283874511719, "step": 815}
+{"info/global_step": 816, "train_info/time_within_train_step": 27.476519346237183, "step": 816}
+{"train_info/time_between_train_steps": 0.005287647247314453, "step": 816}
+{"info/global_step": 817, "train_info/time_within_train_step": 27.317163944244385, "step": 817}
+{"train_info/time_between_train_steps": 0.005681514739990234, "step": 817}
+{"info/global_step": 818, "train_info/time_within_train_step": 27.561129808425903, "step": 818}
+{"train_info/time_between_train_steps": 0.01153707504272461, "step": 818}
+{"info/global_step": 819, "train_info/time_within_train_step": 27.342931747436523, "step": 819}
+{"train_info/time_between_train_steps": 0.005120277404785156, "step": 819}
+{"info/global_step": 820, "train_info/time_within_train_step": 27.396070957183838, "step": 820}
+{"train_info/time_between_train_steps": 0.009307384490966797, "step": 820}
+{"info/global_step": 821, "train_info/time_within_train_step": 27.218457221984863, "step": 821}
+{"train_info/time_between_train_steps": 0.005281209945678711, "step": 821}
+{"info/global_step": 822, "train_info/time_within_train_step": 27.25928258895874, "step": 822}
+{"train_info/time_between_train_steps": 0.010068416595458984, "step": 822}
+{"info/global_step": 823, "train_info/time_within_train_step": 27.410852432250977, "step": 823}
+{"train_info/time_between_train_steps": 0.008281707763671875, "step": 823}
+{"info/global_step": 824, "train_info/time_within_train_step": 27.26513648033142, "step": 824}
+{"train_info/time_between_train_steps": 0.0052700042724609375, "step": 824}
+{"info/global_step": 825, "train_info/time_within_train_step": 27.27999758720398, "step": 825}
+{"train_info/time_between_train_steps": 0.005312204360961914, "step": 825}
+{"info/global_step": 826, "train_info/time_within_train_step": 27.41432547569275, "step": 826}
+{"train_info/time_between_train_steps": 0.005291938781738281, "step": 826}
+{"info/global_step": 827, "train_info/time_within_train_step": 27.318915367126465, "step": 827}
+{"train_info/time_between_train_steps": 0.005217790603637695, "step": 827}
+{"info/global_step": 828, "train_info/time_within_train_step": 27.253827333450317, "step": 828}
+{"train_info/time_between_train_steps": 0.010209083557128906, "step": 828}
+{"info/global_step": 829, "train_info/time_within_train_step": 27.323243856430054, "step": 829}
+{"train_info/time_between_train_steps": 0.005367279052734375, "step": 829}
+{"info/global_step": 830, "train_info/time_within_train_step": 27.31266140937805, "step": 830}
+{"train_info/time_between_train_steps": 0.005451202392578125, "step": 830}
+{"info/global_step": 831, "train_info/time_within_train_step": 27.330641746520996, "step": 831}
+{"train_info/time_between_train_steps": 0.0055658817291259766, "step": 831}
+{"info/global_step": 832, "train_info/time_within_train_step": 27.273571968078613, "step": 832}
+{"train_info/time_between_train_steps": 0.00540614128112793, "step": 832}
+{"info/global_step": 833, "train_info/time_within_train_step": 27.474451541900635, "step": 833}
+{"train_info/time_between_train_steps": 0.005401134490966797, "step": 833}
+{"info/global_step": 834, "train_info/time_within_train_step": 27.29665470123291, "step": 834}
+{"train_info/time_between_train_steps": 0.005502223968505859, "step": 834}
+{"info/global_step": 835, "train_info/time_within_train_step": 27.293922185897827, "step": 835}
+{"train_info/time_between_train_steps": 0.005615949630737305, "step": 835}
+{"info/global_step": 836, "train_info/time_within_train_step": 27.27749514579773, "step": 836}
+{"train_info/time_between_train_steps": 0.0055735111236572266, "step": 836}
+{"info/global_step": 837, "train_info/time_within_train_step": 27.30457305908203, "step": 837}
+{"train_info/time_between_train_steps": 0.00632476806640625, "step": 837}
+{"train_info/time_between_train_steps": 12.965120792388916, "step": 837}
+{"info/global_step": 838, "train_info/time_within_train_step": 27.266828536987305, "step": 838}
+{"train_info/time_between_train_steps": 0.010934829711914062, "step": 838}
+{"info/global_step": 839, "train_info/time_within_train_step": 27.453619956970215, "step": 839}
+{"train_info/time_between_train_steps": 0.010590553283691406, "step": 839}
+{"info/global_step": 840, "train_info/time_within_train_step": 27.275349378585815, "step": 840}
+{"train_info/time_between_train_steps": 0.010614633560180664, "step": 840}
+{"info/global_step": 841, "train_info/time_within_train_step": 27.47219228744507, "step": 841}
+{"train_info/time_between_train_steps": 0.0054552555084228516, "step": 841}
+{"info/global_step": 842, "train_info/time_within_train_step": 27.25087881088257, "step": 842}
+{"train_info/time_between_train_steps": 0.0054242610931396484, "step": 842}
+{"info/global_step": 843, "train_info/time_within_train_step": 27.392587184906006, "step": 843}
+{"train_info/time_between_train_steps": 0.005600690841674805, "step": 843}
+{"info/global_step": 844, "train_info/time_within_train_step": 27.22268772125244, "step": 844}
+{"train_info/time_between_train_steps": 0.005658864974975586, "step": 844}
+{"info/global_step": 845, "train_info/time_within_train_step": 27.386623859405518, "step": 845}
+{"train_info/time_between_train_steps": 0.005041837692260742, "step": 845}
+{"info/global_step": 846, "train_info/time_within_train_step": 27.288535594940186, "step": 846}
+{"train_info/time_between_train_steps": 0.00868368148803711, "step": 846}
+{"info/global_step": 847, "train_info/time_within_train_step": 27.28698468208313, "step": 847}
+{"train_info/time_between_train_steps": 0.005117177963256836, "step": 847}
+{"info/global_step": 848, "train_info/time_within_train_step": 27.25102710723877, "step": 848}
+{"train_info/time_between_train_steps": 0.005267620086669922, "step": 848}
+{"info/global_step": 849, "train_info/time_within_train_step": 27.298950672149658, "step": 849}
+{"train_info/time_between_train_steps": 0.01091146469116211, "step": 849}
+{"info/global_step": 850, "train_info/time_within_train_step": 27.322795152664185, "step": 850}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734036256, "_runtime": 26707}, "step": 850}
+{"logs": {"train/loss": 3.6562, "train/learning_rate": 0.00019444444444444443, "train/epoch": 31.01, "_timestamp": 1734036256, "_runtime": 26707}, "step": 850}
+{"train_info/time_between_train_steps": 0.009222745895385742, "step": 850}
+{"info/global_step": 851, "train_info/time_within_train_step": 27.321372747421265, "step": 851}
+{"train_info/time_between_train_steps": 0.0050394535064697266, "step": 851}
+{"info/global_step": 852, "train_info/time_within_train_step": 27.493122339248657, "step": 852}
+{"train_info/time_between_train_steps": 0.009876728057861328, "step": 852}
+{"info/global_step": 853, "train_info/time_within_train_step": 27.410497903823853, "step": 853}
+{"train_info/time_between_train_steps": 0.011096715927124023, "step": 853}
+{"info/global_step": 854, "train_info/time_within_train_step": 27.313435077667236, "step": 854}
+{"train_info/time_between_train_steps": 0.012814521789550781, "step": 854}
+{"info/global_step": 855, "train_info/time_within_train_step": 27.38489818572998, "step": 855}
+{"train_info/time_between_train_steps": 0.012445688247680664, "step": 855}
+{"info/global_step": 856, "train_info/time_within_train_step": 27.37485933303833, "step": 856}
+{"train_info/time_between_train_steps": 0.00980687141418457, "step": 856}
+{"info/global_step": 857, "train_info/time_within_train_step": 27.46886420249939, "step": 857}
+{"train_info/time_between_train_steps": 0.009824275970458984, "step": 857}
+{"info/global_step": 858, "train_info/time_within_train_step": 27.464054107666016, "step": 858}
+{"train_info/time_between_train_steps": 0.008603811264038086, "step": 858}
+{"info/global_step": 859, "train_info/time_within_train_step": 27.30604863166809, "step": 859}
+{"train_info/time_between_train_steps": 0.010388612747192383, "step": 859}
+{"info/global_step": 860, "train_info/time_within_train_step": 27.491275787353516, "step": 860}
+{"train_info/time_between_train_steps": 0.0053141117095947266, "step": 860}
+{"info/global_step": 861, "train_info/time_within_train_step": 27.469119548797607, "step": 861}
+{"train_info/time_between_train_steps": 0.011352062225341797, "step": 861}
+{"info/global_step": 862, "train_info/time_within_train_step": 27.396422147750854, "step": 862}
+{"train_info/time_between_train_steps": 0.010184049606323242, "step": 862}
+{"info/global_step": 863, "train_info/time_within_train_step": 27.442481756210327, "step": 863}
+{"train_info/time_between_train_steps": 0.011969804763793945, "step": 863}
+{"info/global_step": 864, "train_info/time_within_train_step": 27.562684297561646, "step": 864}
+{"train_info/time_between_train_steps": 0.010909795761108398, "step": 864}
+{"train_info/time_between_train_steps": 13.275043249130249, "step": 864}
+{"info/global_step": 865, "train_info/time_within_train_step": 27.465948820114136, "step": 865}
+{"train_info/time_between_train_steps": 0.021054506301879883, "step": 865}
+{"info/global_step": 866, "train_info/time_within_train_step": 27.621692657470703, "step": 866}
+{"train_info/time_between_train_steps": 0.008841991424560547, "step": 866}
+{"info/global_step": 867, "train_info/time_within_train_step": 27.37183976173401, "step": 867}
+{"train_info/time_between_train_steps": 0.0055027008056640625, "step": 867}
+{"info/global_step": 868, "train_info/time_within_train_step": 27.553333044052124, "step": 868}
+{"train_info/time_between_train_steps": 0.011001348495483398, "step": 868}
+{"info/global_step": 869, "train_info/time_within_train_step": 51.920953035354614, "step": 869}
+{"train_info/time_between_train_steps": 0.015885353088378906, "step": 869}
+{"info/global_step": 870, "train_info/time_within_train_step": 62.65101337432861, "step": 870}
+{"train_info/time_between_train_steps": 0.008043050765991211, "step": 870}
+{"info/global_step": 871, "train_info/time_within_train_step": 37.6160032749176, "step": 871}
+{"train_info/time_between_train_steps": 0.011846780776977539, "step": 871}
+{"info/global_step": 872, "train_info/time_within_train_step": 63.39317321777344, "step": 872}
+{"train_info/time_between_train_steps": 0.01500844955444336, "step": 872}
+{"info/global_step": 873, "train_info/time_within_train_step": 62.27837824821472, "step": 873}
+{"train_info/time_between_train_steps": 0.0055370330810546875, "step": 873}
+{"info/global_step": 874, "train_info/time_within_train_step": 62.05388331413269, "step": 874}
+{"train_info/time_between_train_steps": 0.006337404251098633, "step": 874}
+{"info/global_step": 875, "train_info/time_within_train_step": 39.596402645111084, "step": 875}
+{"train_info/time_between_train_steps": 0.005392789840698242, "step": 875}
+{"info/global_step": 876, "train_info/time_within_train_step": 63.18334889411926, "step": 876}
+{"train_info/time_between_train_steps": 0.005428314208984375, "step": 876}
+{"info/global_step": 877, "train_info/time_within_train_step": 62.2247953414917, "step": 877}
+{"train_info/time_between_train_steps": 0.015035629272460938, "step": 877}
+{"info/global_step": 878, "train_info/time_within_train_step": 62.887035608291626, "step": 878}
+{"train_info/time_between_train_steps": 0.007304668426513672, "step": 878}
+{"info/global_step": 879, "train_info/time_within_train_step": 62.497037172317505, "step": 879}
+{"train_info/time_between_train_steps": 0.005354642868041992, "step": 879}
+{"info/global_step": 880, "train_info/time_within_train_step": 61.82414412498474, "step": 880}
+{"train_info/time_between_train_steps": 0.010999441146850586, "step": 880}
+{"info/global_step": 881, "train_info/time_within_train_step": 41.82953453063965, "step": 881}
+{"train_info/time_between_train_steps": 0.005425930023193359, "step": 881}
+{"info/global_step": 882, "train_info/time_within_train_step": 63.02771329879761, "step": 882}
+{"train_info/time_between_train_steps": 0.009737968444824219, "step": 882}
+{"info/global_step": 883, "train_info/time_within_train_step": 39.43227767944336, "step": 883}
+{"train_info/time_between_train_steps": 0.007391452789306641, "step": 883}
+{"info/global_step": 884, "train_info/time_within_train_step": 63.26054930686951, "step": 884}
+{"train_info/time_between_train_steps": 0.0054018497467041016, "step": 884}
+{"info/global_step": 885, "train_info/time_within_train_step": 50.48312950134277, "step": 885}
+{"train_info/time_between_train_steps": 0.005446910858154297, "step": 885}
+{"info/global_step": 886, "train_info/time_within_train_step": 50.60595226287842, "step": 886}
+{"train_info/time_between_train_steps": 0.015273094177246094, "step": 886}
+{"info/global_step": 887, "train_info/time_within_train_step": 62.69363522529602, "step": 887}
+{"train_info/time_between_train_steps": 0.009340286254882812, "step": 887}
+{"info/global_step": 888, "train_info/time_within_train_step": 30.237990617752075, "step": 888}
+{"train_info/time_between_train_steps": 0.008881568908691406, "step": 888}
+{"info/global_step": 889, "train_info/time_within_train_step": 28.14458155632019, "step": 889}
+{"train_info/time_between_train_steps": 0.011001825332641602, "step": 889}
+{"info/global_step": 890, "train_info/time_within_train_step": 56.87730526924133, "step": 890}
+{"train_info/time_between_train_steps": 0.005558013916015625, "step": 890}
+{"info/global_step": 891, "train_info/time_within_train_step": 62.39712166786194, "step": 891}
+{"train_info/time_between_train_steps": 0.010301351547241211, "step": 891}
+{"train_info/time_between_train_steps": 25.625715970993042, "step": 891}
+{"info/global_step": 892, "train_info/time_within_train_step": 62.86077165603638, "step": 892}
+{"train_info/time_between_train_steps": 0.019148826599121094, "step": 892}
+{"info/global_step": 893, "train_info/time_within_train_step": 63.31811189651489, "step": 893}
+{"train_info/time_between_train_steps": 0.01279759407043457, "step": 893}
+{"info/global_step": 894, "train_info/time_within_train_step": 56.527711153030396, "step": 894}
+{"train_info/time_between_train_steps": 0.005804300308227539, "step": 894}
+{"info/global_step": 895, "train_info/time_within_train_step": 44.74512267112732, "step": 895}
+{"train_info/time_between_train_steps": 0.011543989181518555, "step": 895}
+{"info/global_step": 896, "train_info/time_within_train_step": 63.030937910079956, "step": 896}
+{"train_info/time_between_train_steps": 0.01079416275024414, "step": 896}
+{"info/global_step": 897, "train_info/time_within_train_step": 63.07753109931946, "step": 897}
+{"train_info/time_between_train_steps": 0.006327390670776367, "step": 897}
+{"info/global_step": 898, "train_info/time_within_train_step": 62.33686971664429, "step": 898}
+{"train_info/time_between_train_steps": 0.007956743240356445, "step": 898}
+{"info/global_step": 899, "train_info/time_within_train_step": 63.28612160682678, "step": 899}
+{"train_info/time_between_train_steps": 0.0057299137115478516, "step": 899}
+{"info/global_step": 900, "train_info/time_within_train_step": 62.322924852371216, "step": 900}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734038572, "_runtime": 29023}, "step": 900}
+{"logs": {"train/loss": 3.5922, "train/learning_rate": 0.00016666666666666666, "train/epoch": 33.01, "_timestamp": 1734038572, "_runtime": 29023}, "step": 900}
+{"train_info/time_between_train_steps": 3.061556100845337, "step": 900}
+{"info/global_step": 901, "train_info/time_within_train_step": 62.97682571411133, "step": 901}
+{"train_info/time_between_train_steps": 0.013787269592285156, "step": 901}
+{"info/global_step": 902, "train_info/time_within_train_step": 62.71828842163086, "step": 902}
+{"train_info/time_between_train_steps": 0.011031866073608398, "step": 902}
+{"info/global_step": 903, "train_info/time_within_train_step": 62.40031862258911, "step": 903}
+{"train_info/time_between_train_steps": 0.013602018356323242, "step": 903}
+{"info/global_step": 904, "train_info/time_within_train_step": 63.39562726020813, "step": 904}
+{"train_info/time_between_train_steps": 0.005532264709472656, "step": 904}
+{"info/global_step": 905, "train_info/time_within_train_step": 62.47146272659302, "step": 905}
+{"train_info/time_between_train_steps": 0.00786900520324707, "step": 905}
+{"info/global_step": 906, "train_info/time_within_train_step": 63.406954288482666, "step": 906}
+{"train_info/time_between_train_steps": 0.011324405670166016, "step": 906}
+{"info/global_step": 907, "train_info/time_within_train_step": 62.477434396743774, "step": 907}
+{"train_info/time_between_train_steps": 0.0071718692779541016, "step": 907}
+{"info/global_step": 908, "train_info/time_within_train_step": 62.728760957717896, "step": 908}
+{"train_info/time_between_train_steps": 0.010489702224731445, "step": 908}
+{"info/global_step": 909, "train_info/time_within_train_step": 63.0454535484314, "step": 909}
+{"train_info/time_between_train_steps": 0.009793996810913086, "step": 909}
+{"info/global_step": 910, "train_info/time_within_train_step": 62.425610065460205, "step": 910}
+{"train_info/time_between_train_steps": 0.009982585906982422, "step": 910}
+{"info/global_step": 911, "train_info/time_within_train_step": 53.41399121284485, "step": 911}
+{"train_info/time_between_train_steps": 0.010485410690307617, "step": 911}
+{"info/global_step": 912, "train_info/time_within_train_step": 61.32019805908203, "step": 912}
+{"train_info/time_between_train_steps": 0.010230541229248047, "step": 912}
+{"info/global_step": 913, "train_info/time_within_train_step": 61.18531560897827, "step": 913}
+{"train_info/time_between_train_steps": 0.007651329040527344, "step": 913}
+{"info/global_step": 914, "train_info/time_within_train_step": 28.19020915031433, "step": 914}
+{"train_info/time_between_train_steps": 0.00841069221496582, "step": 914}
+{"info/global_step": 915, "train_info/time_within_train_step": 27.40661120414734, "step": 915}
+{"train_info/time_between_train_steps": 0.012045860290527344, "step": 915}
+{"info/global_step": 916, "train_info/time_within_train_step": 27.328063488006592, "step": 916}
+{"train_info/time_between_train_steps": 0.005514621734619141, "step": 916}
+{"info/global_step": 917, "train_info/time_within_train_step": 47.039403200149536, "step": 917}
+{"train_info/time_between_train_steps": 0.012252569198608398, "step": 917}
+{"info/global_step": 918, "train_info/time_within_train_step": 62.882707595825195, "step": 918}
+{"train_info/time_between_train_steps": 0.010978221893310547, "step": 918}
+{"train_info/time_between_train_steps": 18.703782558441162, "step": 918}
+{"info/global_step": 919, "train_info/time_within_train_step": 47.69794583320618, "step": 919}
+{"train_info/time_between_train_steps": 0.015057563781738281, "step": 919}
+{"info/global_step": 920, "train_info/time_within_train_step": 63.06863713264465, "step": 920}
+{"train_info/time_between_train_steps": 0.009346723556518555, "step": 920}
+{"info/global_step": 921, "train_info/time_within_train_step": 63.08142924308777, "step": 921}
+{"train_info/time_between_train_steps": 0.005860567092895508, "step": 921}
+{"info/global_step": 922, "train_info/time_within_train_step": 62.64836883544922, "step": 922}
+{"train_info/time_between_train_steps": 0.0054895877838134766, "step": 922}
+{"info/global_step": 923, "train_info/time_within_train_step": 63.35441255569458, "step": 923}
+{"train_info/time_between_train_steps": 0.010227203369140625, "step": 923}
+{"info/global_step": 924, "train_info/time_within_train_step": 62.62896418571472, "step": 924}
+{"train_info/time_between_train_steps": 0.009935379028320312, "step": 924}
+{"info/global_step": 925, "train_info/time_within_train_step": 62.97026968002319, "step": 925}
+{"train_info/time_between_train_steps": 0.011758089065551758, "step": 925}
+{"info/global_step": 926, "train_info/time_within_train_step": 37.4369592666626, "step": 926}
+{"train_info/time_between_train_steps": 0.005534648895263672, "step": 926}
+{"info/global_step": 927, "train_info/time_within_train_step": 63.18580389022827, "step": 927}
+{"train_info/time_between_train_steps": 0.00539398193359375, "step": 927}
+{"info/global_step": 928, "train_info/time_within_train_step": 62.264772176742554, "step": 928}
+{"train_info/time_between_train_steps": 0.005322694778442383, "step": 928}
+{"info/global_step": 929, "train_info/time_within_train_step": 63.21875357627869, "step": 929}
+{"train_info/time_between_train_steps": 0.005396604537963867, "step": 929}
+{"info/global_step": 930, "train_info/time_within_train_step": 40.896652698516846, "step": 930}
+{"train_info/time_between_train_steps": 0.0051479339599609375, "step": 930}
+{"info/global_step": 931, "train_info/time_within_train_step": 59.394148111343384, "step": 931}
+{"train_info/time_between_train_steps": 0.008271455764770508, "step": 931}
+{"info/global_step": 932, "train_info/time_within_train_step": 62.34831666946411, "step": 932}
+{"train_info/time_between_train_steps": 0.005458354949951172, "step": 932}
+{"info/global_step": 933, "train_info/time_within_train_step": 63.17858099937439, "step": 933}
+{"train_info/time_between_train_steps": 0.0058176517486572266, "step": 933}
+{"info/global_step": 934, "train_info/time_within_train_step": 62.28332734107971, "step": 934}
+{"train_info/time_between_train_steps": 0.015312671661376953, "step": 934}
+{"info/global_step": 935, "train_info/time_within_train_step": 62.82632541656494, "step": 935}
+{"train_info/time_between_train_steps": 0.007242918014526367, "step": 935}
+{"info/global_step": 936, "train_info/time_within_train_step": 62.49331831932068, "step": 936}
+{"train_info/time_between_train_steps": 0.0055201053619384766, "step": 936}
+{"info/global_step": 937, "train_info/time_within_train_step": 62.30316472053528, "step": 937}
+{"train_info/time_between_train_steps": 0.00527644157409668, "step": 937}
+{"info/global_step": 938, "train_info/time_within_train_step": 63.36114001274109, "step": 938}
+{"train_info/time_between_train_steps": 0.010723352432250977, "step": 938}
+{"info/global_step": 939, "train_info/time_within_train_step": 62.376769065856934, "step": 939}
+{"train_info/time_between_train_steps": 0.007627248764038086, "step": 939}
+{"info/global_step": 940, "train_info/time_within_train_step": 63.28385853767395, "step": 940}
+{"train_info/time_between_train_steps": 0.0077800750732421875, "step": 940}
+{"info/global_step": 941, "train_info/time_within_train_step": 64.257080078125, "step": 941}
+{"train_info/time_between_train_steps": 0.00582575798034668, "step": 941}
+{"info/global_step": 942, "train_info/time_within_train_step": 62.58527636528015, "step": 942}
+{"train_info/time_between_train_steps": 0.011867761611938477, "step": 942}
+{"info/global_step": 943, "train_info/time_within_train_step": 63.21392583847046, "step": 943}
+{"train_info/time_between_train_steps": 0.013350486755371094, "step": 943}
+{"info/global_step": 944, "train_info/time_within_train_step": 62.453465938568115, "step": 944}
+{"train_info/time_between_train_steps": 0.008393287658691406, "step": 944}
+{"info/global_step": 945, "train_info/time_within_train_step": 63.3970685005188, "step": 945}
+{"train_info/time_between_train_steps": 0.006480693817138672, "step": 945}
+{"train_info/time_between_train_steps": 26.55343246459961, "step": 945}
+{"info/global_step": 946, "train_info/time_within_train_step": 63.38732099533081, "step": 946}
+{"train_info/time_between_train_steps": 0.00795435905456543, "step": 946}
+{"info/global_step": 947, "train_info/time_within_train_step": 62.63818907737732, "step": 947}
+{"train_info/time_between_train_steps": 0.005800008773803711, "step": 947}
+{"info/global_step": 948, "train_info/time_within_train_step": 63.22262167930603, "step": 948}
+{"train_info/time_between_train_steps": 0.008102655410766602, "step": 948}
+{"info/global_step": 949, "train_info/time_within_train_step": 62.7906539440155, "step": 949}
+{"train_info/time_between_train_steps": 0.007899284362792969, "step": 949}
+{"info/global_step": 950, "train_info/time_within_train_step": 62.54882049560547, "step": 950}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734041564, "_runtime": 32015}, "step": 950}
+{"logs": {"train/loss": 3.5355, "train/learning_rate": 0.0001388888888888889, "train/epoch": 35.0, "_timestamp": 1734041564, "_runtime": 32015}, "step": 950}
+{"train_info/time_between_train_steps": 0.013857364654541016, "step": 950}
+{"info/global_step": 951, "train_info/time_within_train_step": 63.429176330566406, "step": 951}
+{"train_info/time_between_train_steps": 0.0056915283203125, "step": 951}
+{"info/global_step": 952, "train_info/time_within_train_step": 62.39622616767883, "step": 952}
+{"train_info/time_between_train_steps": 0.005803108215332031, "step": 952}
+{"info/global_step": 953, "train_info/time_within_train_step": 63.29966187477112, "step": 953}
+{"train_info/time_between_train_steps": 0.010384321212768555, "step": 953}
+{"info/global_step": 954, "train_info/time_within_train_step": 61.322842836380005, "step": 954}
+{"train_info/time_between_train_steps": 0.00647282600402832, "step": 954}
+{"info/global_step": 955, "train_info/time_within_train_step": 54.79644155502319, "step": 955}
+{"train_info/time_between_train_steps": 0.00787973403930664, "step": 955}
+{"info/global_step": 956, "train_info/time_within_train_step": 60.923102617263794, "step": 956}
+{"train_info/time_between_train_steps": 0.007504940032958984, "step": 956}
+{"info/global_step": 957, "train_info/time_within_train_step": 48.944650173187256, "step": 957}
+{"train_info/time_between_train_steps": 0.00538182258605957, "step": 957}
+{"info/global_step": 958, "train_info/time_within_train_step": 27.41001868247986, "step": 958}
+{"train_info/time_between_train_steps": 0.010329723358154297, "step": 958}
+{"info/global_step": 959, "train_info/time_within_train_step": 27.407601594924927, "step": 959}
+{"train_info/time_between_train_steps": 0.005594730377197266, "step": 959}
+{"info/global_step": 960, "train_info/time_within_train_step": 27.27329993247986, "step": 960}
+{"train_info/time_between_train_steps": 0.005371570587158203, "step": 960}
+{"info/global_step": 961, "train_info/time_within_train_step": 58.25912618637085, "step": 961}
+{"train_info/time_between_train_steps": 0.011734724044799805, "step": 961}
+{"info/global_step": 962, "train_info/time_within_train_step": 57.274351835250854, "step": 962}
+{"train_info/time_between_train_steps": 0.009536504745483398, "step": 962}
+{"info/global_step": 963, "train_info/time_within_train_step": 42.33876061439514, "step": 963}
+{"train_info/time_between_train_steps": 0.005704641342163086, "step": 963}
+{"info/global_step": 964, "train_info/time_within_train_step": 63.04236912727356, "step": 964}
+{"train_info/time_between_train_steps": 0.012363433837890625, "step": 964}
+{"info/global_step": 965, "train_info/time_within_train_step": 62.53076696395874, "step": 965}
+{"train_info/time_between_train_steps": 0.0055103302001953125, "step": 965}
+{"info/global_step": 966, "train_info/time_within_train_step": 62.37832260131836, "step": 966}
+{"train_info/time_between_train_steps": 0.007625102996826172, "step": 966}
+{"info/global_step": 967, "train_info/time_within_train_step": 63.11237382888794, "step": 967}
+{"train_info/time_between_train_steps": 0.00915670394897461, "step": 967}
+{"info/global_step": 968, "train_info/time_within_train_step": 52.604639291763306, "step": 968}
+{"train_info/time_between_train_steps": 0.00596928596496582, "step": 968}
+{"info/global_step": 969, "train_info/time_within_train_step": 47.97503161430359, "step": 969}
+{"train_info/time_between_train_steps": 0.005521297454833984, "step": 969}
+{"info/global_step": 970, "train_info/time_within_train_step": 62.73930740356445, "step": 970}
+{"train_info/time_between_train_steps": 0.010477542877197266, "step": 970}
+{"info/global_step": 971, "train_info/time_within_train_step": 62.97542095184326, "step": 971}
+{"train_info/time_between_train_steps": 0.005883455276489258, "step": 971}
+{"info/global_step": 972, "train_info/time_within_train_step": 63.92523789405823, "step": 972}
+{"train_info/time_between_train_steps": 0.012866735458374023, "step": 972}
+{"train_info/time_between_train_steps": 26.318257808685303, "step": 972}
+{"info/global_step": 973, "train_info/time_within_train_step": 62.53993082046509, "step": 973}
+{"train_info/time_between_train_steps": 0.016017675399780273, "step": 973}
+{"info/global_step": 974, "train_info/time_within_train_step": 36.433876276016235, "step": 974}
+{"train_info/time_between_train_steps": 0.01091909408569336, "step": 974}
+{"info/global_step": 975, "train_info/time_within_train_step": 63.146461725234985, "step": 975}
+{"train_info/time_between_train_steps": 0.006495475769042969, "step": 975}
+{"info/global_step": 976, "train_info/time_within_train_step": 62.68953347206116, "step": 976}
+{"train_info/time_between_train_steps": 0.006734371185302734, "step": 976}
+{"info/global_step": 977, "train_info/time_within_train_step": 62.712300062179565, "step": 977}
+{"train_info/time_between_train_steps": 0.008623838424682617, "step": 977}
+{"info/global_step": 978, "train_info/time_within_train_step": 63.12292981147766, "step": 978}
+{"train_info/time_between_train_steps": 0.012515783309936523, "step": 978}
+{"info/global_step": 979, "train_info/time_within_train_step": 62.37418985366821, "step": 979}
+{"train_info/time_between_train_steps": 0.006260871887207031, "step": 979}
+{"info/global_step": 980, "train_info/time_within_train_step": 63.33837842941284, "step": 980}
+{"train_info/time_between_train_steps": 0.008112192153930664, "step": 980}
+{"info/global_step": 981, "train_info/time_within_train_step": 62.335718393325806, "step": 981}
+{"train_info/time_between_train_steps": 0.005654096603393555, "step": 981}
+{"info/global_step": 982, "train_info/time_within_train_step": 63.109174489974976, "step": 982}
+{"train_info/time_between_train_steps": 0.005567073822021484, "step": 982}
+{"info/global_step": 983, "train_info/time_within_train_step": 62.50404238700867, "step": 983}
+{"train_info/time_between_train_steps": 0.005404472351074219, "step": 983}
+{"info/global_step": 984, "train_info/time_within_train_step": 62.452019453048706, "step": 984}
+{"train_info/time_between_train_steps": 0.005867719650268555, "step": 984}
+{"info/global_step": 985, "train_info/time_within_train_step": 63.243908405303955, "step": 985}
+{"train_info/time_between_train_steps": 0.005873680114746094, "step": 985}
+{"info/global_step": 986, "train_info/time_within_train_step": 53.083576917648315, "step": 986}
+{"train_info/time_between_train_steps": 0.005470752716064453, "step": 986}
+{"info/global_step": 987, "train_info/time_within_train_step": 44.80816984176636, "step": 987}
+{"train_info/time_between_train_steps": 0.00574493408203125, "step": 987}
+{"info/global_step": 988, "train_info/time_within_train_step": 64.53012466430664, "step": 988}
+{"train_info/time_between_train_steps": 0.007409334182739258, "step": 988}
+{"info/global_step": 989, "train_info/time_within_train_step": 63.047619342803955, "step": 989}
+{"train_info/time_between_train_steps": 0.01578044891357422, "step": 989}
+{"info/global_step": 990, "train_info/time_within_train_step": 62.432838439941406, "step": 990}
+{"train_info/time_between_train_steps": 0.01806330680847168, "step": 990}
+{"info/global_step": 991, "train_info/time_within_train_step": 63.29695177078247, "step": 991}
+{"train_info/time_between_train_steps": 0.00551295280456543, "step": 991}
+{"info/global_step": 992, "train_info/time_within_train_step": 62.38561940193176, "step": 992}
+{"train_info/time_between_train_steps": 0.011825799942016602, "step": 992}
+{"info/global_step": 993, "train_info/time_within_train_step": 63.0623083114624, "step": 993}
+{"train_info/time_between_train_steps": 0.012602090835571289, "step": 993}
+{"info/global_step": 994, "train_info/time_within_train_step": 62.71667551994324, "step": 994}
+{"train_info/time_between_train_steps": 0.006068229675292969, "step": 994}
+{"info/global_step": 995, "train_info/time_within_train_step": 62.435439109802246, "step": 995}
+{"train_info/time_between_train_steps": 0.010195255279541016, "step": 995}
+{"info/global_step": 996, "train_info/time_within_train_step": 63.346940994262695, "step": 996}
+{"train_info/time_between_train_steps": 0.007619380950927734, "step": 996}
+{"info/global_step": 997, "train_info/time_within_train_step": 62.391470193862915, "step": 997}
+{"train_info/time_between_train_steps": 0.0056133270263671875, "step": 997}
+{"info/global_step": 998, "train_info/time_within_train_step": 63.41678237915039, "step": 998}
+{"train_info/time_between_train_steps": 0.00680088996887207, "step": 998}
+{"info/global_step": 999, "train_info/time_within_train_step": 60.38480305671692, "step": 999}
+{"train_info/time_between_train_steps": 0.011990070343017578, "step": 999}
+{"train_info/time_between_train_steps": 19.85333228111267, "step": 999}
+{"info/global_step": 1000, "train_info/time_within_train_step": 61.14815425872803, "step": 1000}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 22626.0, "train_info/memory_max_reserved": 22626.0, "_timestamp": 1734044510, "_runtime": 34961}, "step": 1000}
+{"logs": {"train/loss": 3.4861, "train/learning_rate": 0.00011111111111111109, "train/epoch": 37.0, "_timestamp": 1734044510, "_runtime": 34961}, "step": 1000}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 12166.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734044515, "_runtime": 34966}, "step": 1000}
+{"logs": {"eval/loss": 4.404828071594238, "eval/runtime": 4.6328, "eval/samples_per_second": 22.017, "eval/steps_per_second": 1.511, "train/epoch": 37.0, "_timestamp": 1734044515, "_runtime": 34966}, "step": 1000}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.560546875, "train_info/memory_reserved": 12166.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734044515, "_runtime": 34966}, "step": 1000}
+{"logs": {"eval//local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_loss": 4.404828071594238, "eval//local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_ppl": 81.84507014102485, "eval//local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_runtime": 4.6328, "eval//local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_samples_per_second": 22.017, "train/epoch": 37.0, "_timestamp": 1734044515, "_runtime": 34966}, "step": 1000}
+{"train_info/time_between_train_steps": 8.32508659362793, "step": 1000}
+{"info/global_step": 1001, "train_info/time_within_train_step": 57.52022886276245, "step": 1001}
+{"train_info/time_between_train_steps": 0.005450010299682617, "step": 1001}
+{"info/global_step": 1002, "train_info/time_within_train_step": 27.30656862258911, "step": 1002}
+{"train_info/time_between_train_steps": 0.005770444869995117, "step": 1002}
+{"info/global_step": 1003, "train_info/time_within_train_step": 27.641233205795288, "step": 1003}
+{"train_info/time_between_train_steps": 0.005473613739013672, "step": 1003}
+{"info/global_step": 1004, "train_info/time_within_train_step": 27.26523756980896, "step": 1004}
+{"train_info/time_between_train_steps": 0.005833625793457031, "step": 1004}
+{"info/global_step": 1005, "train_info/time_within_train_step": 54.876375913619995, "step": 1005}
+{"train_info/time_between_train_steps": 0.01534581184387207, "step": 1005}
+{"info/global_step": 1006, "train_info/time_within_train_step": 60.74109506607056, "step": 1006}
+{"train_info/time_between_train_steps": 0.005869150161743164, "step": 1006}
+{"info/global_step": 1007, "train_info/time_within_train_step": 42.134511947631836, "step": 1007}
+{"train_info/time_between_train_steps": 0.016797542572021484, "step": 1007}
+{"info/global_step": 1008, "train_info/time_within_train_step": 63.16669321060181, "step": 1008}
+{"train_info/time_between_train_steps": 0.005442142486572266, "step": 1008}
+{"info/global_step": 1009, "train_info/time_within_train_step": 62.58721590042114, "step": 1009}
+{"train_info/time_between_train_steps": 0.0053386688232421875, "step": 1009}
+{"info/global_step": 1010, "train_info/time_within_train_step": 62.47753071784973, "step": 1010}
+{"train_info/time_between_train_steps": 0.009888648986816406, "step": 1010}
+{"info/global_step": 1011, "train_info/time_within_train_step": 63.380860328674316, "step": 1011}
+{"train_info/time_between_train_steps": 0.005367279052734375, "step": 1011}
+{"info/global_step": 1012, "train_info/time_within_train_step": 62.3993353843689, "step": 1012}
+{"train_info/time_between_train_steps": 0.00525212287902832, "step": 1012}
+{"info/global_step": 1013, "train_info/time_within_train_step": 63.472524642944336, "step": 1013}
+{"train_info/time_between_train_steps": 0.010436296463012695, "step": 1013}
+{"info/global_step": 1014, "train_info/time_within_train_step": 42.85775804519653, "step": 1014}
+{"train_info/time_between_train_steps": 0.013994693756103516, "step": 1014}
+{"info/global_step": 1015, "train_info/time_within_train_step": 58.842888832092285, "step": 1015}
+{"train_info/time_between_train_steps": 0.005355358123779297, "step": 1015}
+{"info/global_step": 1016, "train_info/time_within_train_step": 62.47709631919861, "step": 1016}
+{"train_info/time_between_train_steps": 0.011743545532226562, "step": 1016}
+{"info/global_step": 1017, "train_info/time_within_train_step": 63.39271426200867, "step": 1017}
+{"train_info/time_between_train_steps": 0.005342245101928711, "step": 1017}
+{"info/global_step": 1018, "train_info/time_within_train_step": 64.04088759422302, "step": 1018}
+{"train_info/time_between_train_steps": 0.005738258361816406, "step": 1018}
+{"info/global_step": 1019, "train_info/time_within_train_step": 62.86748957633972, "step": 1019}
+{"train_info/time_between_train_steps": 0.007455110549926758, "step": 1019}
+{"info/global_step": 1020, "train_info/time_within_train_step": 62.868178844451904, "step": 1020}
+{"train_info/time_between_train_steps": 0.005434274673461914, "step": 1020}
+{"info/global_step": 1021, "train_info/time_within_train_step": 62.470056772232056, "step": 1021}
+{"train_info/time_between_train_steps": 0.007446765899658203, "step": 1021}
+{"info/global_step": 1022, "train_info/time_within_train_step": 63.382142066955566, "step": 1022}
+{"train_info/time_between_train_steps": 0.015613317489624023, "step": 1022}
+{"info/global_step": 1023, "train_info/time_within_train_step": 62.485761880874634, "step": 1023}
+{"train_info/time_between_train_steps": 0.011734962463378906, "step": 1023}
+{"info/global_step": 1024, "train_info/time_within_train_step": 63.36758756637573, "step": 1024}
+{"train_info/time_between_train_steps": 0.009512901306152344, "step": 1024}
+{"info/global_step": 1025, "train_info/time_within_train_step": 62.68414545059204, "step": 1025}
+{"train_info/time_between_train_steps": 0.00947260856628418, "step": 1025}
+{"info/global_step": 1026, "train_info/time_within_train_step": 62.746702432632446, "step": 1026}
+{"train_info/time_between_train_steps": 0.006804227828979492, "step": 1026}
+{"train_info/time_between_train_steps": 26.13748860359192, "step": 1026}
+{"info/global_step": 1027, "train_info/time_within_train_step": 63.167189836502075, "step": 1027}
+{"train_info/time_between_train_steps": 0.009219884872436523, "step": 1027}
+{"info/global_step": 1028, "train_info/time_within_train_step": 63.05434489250183, "step": 1028}
+{"train_info/time_between_train_steps": 0.005529642105102539, "step": 1028}
+{"info/global_step": 1029, "train_info/time_within_train_step": 62.59279227256775, "step": 1029}
+{"train_info/time_between_train_steps": 0.010693550109863281, "step": 1029}
+{"info/global_step": 1030, "train_info/time_within_train_step": 63.69941544532776, "step": 1030}
+{"train_info/time_between_train_steps": 0.0076639652252197266, "step": 1030}
+{"info/global_step": 1031, "train_info/time_within_train_step": 62.55499053001404, "step": 1031}
+{"train_info/time_between_train_steps": 0.00996708869934082, "step": 1031}
+{"info/global_step": 1032, "train_info/time_within_train_step": 63.58674383163452, "step": 1032}
+{"train_info/time_between_train_steps": 0.009474515914916992, "step": 1032}
+{"info/global_step": 1033, "train_info/time_within_train_step": 62.58392572402954, "step": 1033}
+{"train_info/time_between_train_steps": 0.0058820247650146484, "step": 1033}
+{"info/global_step": 1034, "train_info/time_within_train_step": 64.38459157943726, "step": 1034}
+{"train_info/time_between_train_steps": 0.005439043045043945, "step": 1034}
+{"info/global_step": 1035, "train_info/time_within_train_step": 63.348588705062866, "step": 1035}
+{"train_info/time_between_train_steps": 0.009347677230834961, "step": 1035}
+{"info/global_step": 1036, "train_info/time_within_train_step": 62.49212861061096, "step": 1036}
+{"train_info/time_between_train_steps": 0.007324695587158203, "step": 1036}
+{"info/global_step": 1037, "train_info/time_within_train_step": 63.54776048660278, "step": 1037}
+{"train_info/time_between_train_steps": 0.005568742752075195, "step": 1037}
+{"info/global_step": 1038, "train_info/time_within_train_step": 62.546159982681274, "step": 1038}
+{"train_info/time_between_train_steps": 0.00706171989440918, "step": 1038}
+{"info/global_step": 1039, "train_info/time_within_train_step": 62.92313241958618, "step": 1039}
+{"train_info/time_between_train_steps": 0.0055789947509765625, "step": 1039}
+{"info/global_step": 1040, "train_info/time_within_train_step": 63.162773847579956, "step": 1040}
+{"train_info/time_between_train_steps": 0.010270833969116211, "step": 1040}
+{"info/global_step": 1041, "train_info/time_within_train_step": 62.577064752578735, "step": 1041}
+{"train_info/time_between_train_steps": 0.005548715591430664, "step": 1041}
+{"info/global_step": 1042, "train_info/time_within_train_step": 55.23290991783142, "step": 1042}
+{"train_info/time_between_train_steps": 0.007314920425415039, "step": 1042}
+{"info/global_step": 1043, "train_info/time_within_train_step": 61.121458768844604, "step": 1043}
+{"train_info/time_between_train_steps": 0.006883382797241211, "step": 1043}
+{"info/global_step": 1044, "train_info/time_within_train_step": 61.24798655509949, "step": 1044}
+{"train_info/time_between_train_steps": 0.005870819091796875, "step": 1044}
+{"info/global_step": 1045, "train_info/time_within_train_step": 31.989116668701172, "step": 1045}
+{"train_info/time_between_train_steps": 0.007679462432861328, "step": 1045}
+{"info/global_step": 1046, "train_info/time_within_train_step": 27.39557981491089, "step": 1046}
+{"train_info/time_between_train_steps": 0.006125211715698242, "step": 1046}
+{"info/global_step": 1047, "train_info/time_within_train_step": 27.511625051498413, "step": 1047}
+{"train_info/time_between_train_steps": 0.010148763656616211, "step": 1047}
+{"info/global_step": 1048, "train_info/time_within_train_step": 29.313949584960938, "step": 1048}
+{"train_info/time_between_train_steps": 0.005853176116943359, "step": 1048}
+{"info/global_step": 1049, "train_info/time_within_train_step": 64.99209761619568, "step": 1049}
+{"train_info/time_between_train_steps": 0.005868434906005859, "step": 1049}
+{"info/global_step": 1050, "train_info/time_within_train_step": 48.7931694984436, "step": 1050}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.89990234375, "train_info/memory_reserved": 22500.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734047368, "_runtime": 37819}, "step": 1050}
+{"logs": {"train/loss": 3.4188, "train/learning_rate": 8.333333333333333e-05, "train/epoch": 38.02, "_timestamp": 1734047368, "_runtime": 37819}, "step": 1050}
+{"train_info/time_between_train_steps": 0.013603687286376953, "step": 1050}
+{"info/global_step": 1051, "train_info/time_within_train_step": 33.48234248161316, "step": 1051}
+{"train_info/time_between_train_steps": 0.00608062744140625, "step": 1051}
+{"info/global_step": 1052, "train_info/time_within_train_step": 63.38808584213257, "step": 1052}
+{"train_info/time_between_train_steps": 0.010888099670410156, "step": 1052}
+{"info/global_step": 1053, "train_info/time_within_train_step": 46.051310777664185, "step": 1053}
+{"train_info/time_between_train_steps": 0.0076906681060791016, "step": 1053}
+{"train_info/time_between_train_steps": 18.53874659538269, "step": 1053}
+{"info/global_step": 1054, "train_info/time_within_train_step": 63.50963759422302, "step": 1054}
+{"train_info/time_between_train_steps": 0.007642984390258789, "step": 1054}
+{"info/global_step": 1055, "train_info/time_within_train_step": 42.443602561950684, "step": 1055}
+{"train_info/time_between_train_steps": 0.01128840446472168, "step": 1055}
+{"info/global_step": 1056, "train_info/time_within_train_step": 58.41509532928467, "step": 1056}
+{"train_info/time_between_train_steps": 0.006020069122314453, "step": 1056}
+{"info/global_step": 1057, "train_info/time_within_train_step": 57.41062021255493, "step": 1057}
+{"train_info/time_between_train_steps": 0.006167411804199219, "step": 1057}
+{"info/global_step": 1058, "train_info/time_within_train_step": 44.2279098033905, "step": 1058}
+{"train_info/time_between_train_steps": 0.006049633026123047, "step": 1058}
+{"info/global_step": 1059, "train_info/time_within_train_step": 63.36138963699341, "step": 1059}
+{"train_info/time_between_train_steps": 0.015890121459960938, "step": 1059}
+{"info/global_step": 1060, "train_info/time_within_train_step": 62.83889389038086, "step": 1060}
+{"train_info/time_between_train_steps": 0.010824918746948242, "step": 1060}
+{"info/global_step": 1061, "train_info/time_within_train_step": 62.54509520530701, "step": 1061}
+{"train_info/time_between_train_steps": 0.0055234432220458984, "step": 1061}
+{"info/global_step": 1062, "train_info/time_within_train_step": 63.35772943496704, "step": 1062}
+{"train_info/time_between_train_steps": 0.0061647891998291016, "step": 1062}
+{"info/global_step": 1063, "train_info/time_within_train_step": 62.43323540687561, "step": 1063}
+{"train_info/time_between_train_steps": 0.018826723098754883, "step": 1063}
+{"info/global_step": 1064, "train_info/time_within_train_step": 63.333645820617676, "step": 1064}
+{"train_info/time_between_train_steps": 0.007447242736816406, "step": 1064}
+{"info/global_step": 1065, "train_info/time_within_train_step": 63.93350076675415, "step": 1065}
+{"train_info/time_between_train_steps": 0.008352279663085938, "step": 1065}
+{"info/global_step": 1066, "train_info/time_within_train_step": 62.59778046607971, "step": 1066}
+{"train_info/time_between_train_steps": 0.009826898574829102, "step": 1066}
+{"info/global_step": 1067, "train_info/time_within_train_step": 63.30595588684082, "step": 1067}
+{"train_info/time_between_train_steps": 0.008797407150268555, "step": 1067}
+{"info/global_step": 1068, "train_info/time_within_train_step": 54.022040367126465, "step": 1068}
+{"train_info/time_between_train_steps": 0.01074838638305664, "step": 1068}
+{"info/global_step": 1069, "train_info/time_within_train_step": 45.61050510406494, "step": 1069}
+{"train_info/time_between_train_steps": 0.005534172058105469, "step": 1069}
+{"info/global_step": 1070, "train_info/time_within_train_step": 62.89878726005554, "step": 1070}
+{"train_info/time_between_train_steps": 0.007684946060180664, "step": 1070}
+{"info/global_step": 1071, "train_info/time_within_train_step": 37.90140748023987, "step": 1071}
+{"train_info/time_between_train_steps": 0.011853694915771484, "step": 1071}
+{"info/global_step": 1072, "train_info/time_within_train_step": 63.34193778038025, "step": 1072}
+{"train_info/time_between_train_steps": 0.01040339469909668, "step": 1072}
+{"info/global_step": 1073, "train_info/time_within_train_step": 62.42655420303345, "step": 1073}
+{"train_info/time_between_train_steps": 0.0054531097412109375, "step": 1073}
+{"info/global_step": 1074, "train_info/time_within_train_step": 63.274948596954346, "step": 1074}
+{"train_info/time_between_train_steps": 0.010798215866088867, "step": 1074}
+{"info/global_step": 1075, "train_info/time_within_train_step": 62.68228554725647, "step": 1075}
+{"train_info/time_between_train_steps": 0.0056934356689453125, "step": 1075}
+{"info/global_step": 1076, "train_info/time_within_train_step": 62.53200817108154, "step": 1076}
+{"train_info/time_between_train_steps": 0.011182546615600586, "step": 1076}
+{"info/global_step": 1077, "train_info/time_within_train_step": 63.16183805465698, "step": 1077}
+{"train_info/time_between_train_steps": 0.005728006362915039, "step": 1077}
+{"info/global_step": 1078, "train_info/time_within_train_step": 62.45727181434631, "step": 1078}
+{"train_info/time_between_train_steps": 0.008066177368164062, "step": 1078}
+{"info/global_step": 1079, "train_info/time_within_train_step": 63.44180083274841, "step": 1079}
+{"train_info/time_between_train_steps": 0.005707263946533203, "step": 1079}
+{"info/global_step": 1080, "train_info/time_within_train_step": 64.11670756340027, "step": 1080}
+{"train_info/time_between_train_steps": 0.0067005157470703125, "step": 1080}
+{"train_info/time_between_train_steps": 25.95883870124817, "step": 1080}
+{"info/global_step": 1081, "train_info/time_within_train_step": 62.58531713485718, "step": 1081}
+{"train_info/time_between_train_steps": 0.011166810989379883, "step": 1081}
+{"info/global_step": 1082, "train_info/time_within_train_step": 63.5464346408844, "step": 1082}
+{"train_info/time_between_train_steps": 0.007837772369384766, "step": 1082}
+{"info/global_step": 1083, "train_info/time_within_train_step": 62.60946035385132, "step": 1083}
+{"train_info/time_between_train_steps": 0.0059778690338134766, "step": 1083}
+{"info/global_step": 1084, "train_info/time_within_train_step": 62.91906428337097, "step": 1084}
+{"train_info/time_between_train_steps": 0.01026463508605957, "step": 1084}
+{"info/global_step": 1085, "train_info/time_within_train_step": 63.14048933982849, "step": 1085}
+{"train_info/time_between_train_steps": 0.006773948669433594, "step": 1085}
+{"info/global_step": 1086, "train_info/time_within_train_step": 62.7990779876709, "step": 1086}
+{"train_info/time_between_train_steps": 0.005720376968383789, "step": 1086}
+{"info/global_step": 1087, "train_info/time_within_train_step": 63.45279049873352, "step": 1087}
+{"train_info/time_between_train_steps": 0.010535001754760742, "step": 1087}
+{"info/global_step": 1088, "train_info/time_within_train_step": 62.47519254684448, "step": 1088}
+{"train_info/time_between_train_steps": 0.005290985107421875, "step": 1088}
+{"info/global_step": 1089, "train_info/time_within_train_step": 53.05156135559082, "step": 1089}
+{"train_info/time_between_train_steps": 0.007432460784912109, "step": 1089}
+{"info/global_step": 1090, "train_info/time_within_train_step": 61.1239595413208, "step": 1090}
+{"train_info/time_between_train_steps": 0.016021251678466797, "step": 1090}
+{"info/global_step": 1091, "train_info/time_within_train_step": 53.86771893501282, "step": 1091}
+{"train_info/time_between_train_steps": 0.005453824996948242, "step": 1091}
+{"info/global_step": 1092, "train_info/time_within_train_step": 27.30121898651123, "step": 1092}
+{"train_info/time_between_train_steps": 0.022616147994995117, "step": 1092}
+{"info/global_step": 1093, "train_info/time_within_train_step": 27.363760232925415, "step": 1093}
+{"train_info/time_between_train_steps": 0.009465932846069336, "step": 1093}
+{"info/global_step": 1094, "train_info/time_within_train_step": 27.383758544921875, "step": 1094}
+{"train_info/time_between_train_steps": 0.0054035186767578125, "step": 1094}
+{"info/global_step": 1095, "train_info/time_within_train_step": 45.29165267944336, "step": 1095}
+{"train_info/time_between_train_steps": 0.00574040412902832, "step": 1095}
+{"info/global_step": 1096, "train_info/time_within_train_step": 62.80965781211853, "step": 1096}
+{"train_info/time_between_train_steps": 0.007388591766357422, "step": 1096}
+{"info/global_step": 1097, "train_info/time_within_train_step": 36.76812291145325, "step": 1097}
+{"train_info/time_between_train_steps": 0.011459589004516602, "step": 1097}
+{"info/global_step": 1098, "train_info/time_within_train_step": 63.19956374168396, "step": 1098}
+{"train_info/time_between_train_steps": 0.010163068771362305, "step": 1098}
+{"info/global_step": 1099, "train_info/time_within_train_step": 50.36128830909729, "step": 1099}
+{"train_info/time_between_train_steps": 0.010805368423461914, "step": 1099}
+{"info/global_step": 1100, "train_info/time_within_train_step": 50.919822692871094, "step": 1100}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.89990234375, "train_info/memory_reserved": 22500.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734050221, "_runtime": 40672}, "step": 1100}
+{"logs": {"train/loss": 3.4056, "train/learning_rate": 5.5555555555555545e-05, "train/epoch": 40.02, "_timestamp": 1734050221, "_runtime": 40672}, "step": 1100}
+{"train_info/time_between_train_steps": 2.7325963973999023, "step": 1100}
+{"info/global_step": 1101, "train_info/time_within_train_step": 61.05610013008118, "step": 1101}
+{"train_info/time_between_train_steps": 0.005434751510620117, "step": 1101}
+{"info/global_step": 1102, "train_info/time_within_train_step": 39.775118350982666, "step": 1102}
+{"train_info/time_between_train_steps": 0.009587526321411133, "step": 1102}
+{"info/global_step": 1103, "train_info/time_within_train_step": 63.250348806381226, "step": 1103}
+{"train_info/time_between_train_steps": 0.00631403923034668, "step": 1103}
+{"info/global_step": 1104, "train_info/time_within_train_step": 40.04391670227051, "step": 1104}
+{"train_info/time_between_train_steps": 0.005766630172729492, "step": 1104}
+{"info/global_step": 1105, "train_info/time_within_train_step": 61.19002556800842, "step": 1105}
+{"train_info/time_between_train_steps": 0.010544776916503906, "step": 1105}
+{"info/global_step": 1106, "train_info/time_within_train_step": 54.38263463973999, "step": 1106}
+{"train_info/time_between_train_steps": 0.012079954147338867, "step": 1106}
+{"info/global_step": 1107, "train_info/time_within_train_step": 45.8213632106781, "step": 1107}
+{"train_info/time_between_train_steps": 0.006433725357055664, "step": 1107}
+{"train_info/time_between_train_steps": 26.791473627090454, "step": 1107}
+{"info/global_step": 1108, "train_info/time_within_train_step": 62.576902866363525, "step": 1108}
+{"train_info/time_between_train_steps": 0.009627580642700195, "step": 1108}
+{"info/global_step": 1109, "train_info/time_within_train_step": 63.66921019554138, "step": 1109}
+{"train_info/time_between_train_steps": 0.010806798934936523, "step": 1109}
+{"info/global_step": 1110, "train_info/time_within_train_step": 62.587557792663574, "step": 1110}
+{"train_info/time_between_train_steps": 0.010827064514160156, "step": 1110}
+{"info/global_step": 1111, "train_info/time_within_train_step": 64.37873601913452, "step": 1111}
+{"train_info/time_between_train_steps": 0.006058692932128906, "step": 1111}
+{"info/global_step": 1112, "train_info/time_within_train_step": 63.36618089675903, "step": 1112}
+{"train_info/time_between_train_steps": 0.00606083869934082, "step": 1112}
+{"info/global_step": 1113, "train_info/time_within_train_step": 62.95195460319519, "step": 1113}
+{"train_info/time_between_train_steps": 0.005652189254760742, "step": 1113}
+{"info/global_step": 1114, "train_info/time_within_train_step": 63.37675738334656, "step": 1114}
+{"train_info/time_between_train_steps": 0.006812334060668945, "step": 1114}
+{"info/global_step": 1115, "train_info/time_within_train_step": 62.57492232322693, "step": 1115}
+{"train_info/time_between_train_steps": 0.005342960357666016, "step": 1115}
+{"info/global_step": 1116, "train_info/time_within_train_step": 62.86858916282654, "step": 1116}
+{"train_info/time_between_train_steps": 0.010614633560180664, "step": 1116}
+{"info/global_step": 1117, "train_info/time_within_train_step": 62.897956132888794, "step": 1117}
+{"train_info/time_between_train_steps": 0.006696224212646484, "step": 1117}
+{"info/global_step": 1118, "train_info/time_within_train_step": 62.32660222053528, "step": 1118}
+{"train_info/time_between_train_steps": 0.009634733200073242, "step": 1118}
+{"info/global_step": 1119, "train_info/time_within_train_step": 63.51453733444214, "step": 1119}
+{"train_info/time_between_train_steps": 0.007210254669189453, "step": 1119}
+{"info/global_step": 1120, "train_info/time_within_train_step": 62.57282638549805, "step": 1120}
+{"train_info/time_between_train_steps": 0.012811660766601562, "step": 1120}
+{"info/global_step": 1121, "train_info/time_within_train_step": 63.28972315788269, "step": 1121}
+{"train_info/time_between_train_steps": 0.010916471481323242, "step": 1121}
+{"info/global_step": 1122, "train_info/time_within_train_step": 62.84155607223511, "step": 1122}
+{"train_info/time_between_train_steps": 0.009619712829589844, "step": 1122}
+{"info/global_step": 1123, "train_info/time_within_train_step": 62.510353803634644, "step": 1123}
+{"train_info/time_between_train_steps": 0.009441852569580078, "step": 1123}
+{"info/global_step": 1124, "train_info/time_within_train_step": 63.419209718704224, "step": 1124}
+{"train_info/time_between_train_steps": 0.005556821823120117, "step": 1124}
+{"info/global_step": 1125, "train_info/time_within_train_step": 62.43175387382507, "step": 1125}
+{"train_info/time_between_train_steps": 0.005846261978149414, "step": 1125}
+{"info/global_step": 1126, "train_info/time_within_train_step": 64.80286812782288, "step": 1126}
+{"train_info/time_between_train_steps": 0.005764007568359375, "step": 1126}
+{"info/global_step": 1127, "train_info/time_within_train_step": 62.51436710357666, "step": 1127}
+{"train_info/time_between_train_steps": 0.009662389755249023, "step": 1127}
+{"info/global_step": 1128, "train_info/time_within_train_step": 62.77400827407837, "step": 1128}
+{"train_info/time_between_train_steps": 0.020162582397460938, "step": 1128}
+{"info/global_step": 1129, "train_info/time_within_train_step": 63.3380925655365, "step": 1129}
+{"train_info/time_between_train_steps": 0.005714893341064453, "step": 1129}
+{"info/global_step": 1130, "train_info/time_within_train_step": 62.55476999282837, "step": 1130}
+{"train_info/time_between_train_steps": 0.009258270263671875, "step": 1130}
+{"info/global_step": 1131, "train_info/time_within_train_step": 63.550376415252686, "step": 1131}
+{"train_info/time_between_train_steps": 0.011609315872192383, "step": 1131}
+{"info/global_step": 1132, "train_info/time_within_train_step": 62.402552127838135, "step": 1132}
+{"train_info/time_between_train_steps": 0.006649971008300781, "step": 1132}
+{"info/global_step": 1133, "train_info/time_within_train_step": 63.03147840499878, "step": 1133}
+{"train_info/time_between_train_steps": 0.01110076904296875, "step": 1133}
+{"info/global_step": 1134, "train_info/time_within_train_step": 55.31583523750305, "step": 1134}
+{"train_info/time_between_train_steps": 0.011222362518310547, "step": 1134}
+{"train_info/time_between_train_steps": 22.863733291625977, "step": 1134}
+{"info/global_step": 1135, "train_info/time_within_train_step": 61.15949463844299, "step": 1135}
+{"train_info/time_between_train_steps": 0.015351295471191406, "step": 1135}
+{"info/global_step": 1136, "train_info/time_within_train_step": 61.51463222503662, "step": 1136}
+{"train_info/time_between_train_steps": 0.0067064762115478516, "step": 1136}
+{"info/global_step": 1137, "train_info/time_within_train_step": 27.429846048355103, "step": 1137}
+{"train_info/time_between_train_steps": 0.006400585174560547, "step": 1137}
+{"info/global_step": 1138, "train_info/time_within_train_step": 27.67456817626953, "step": 1138}
+{"train_info/time_between_train_steps": 0.005993843078613281, "step": 1138}
+{"info/global_step": 1139, "train_info/time_within_train_step": 27.436121463775635, "step": 1139}
+{"train_info/time_between_train_steps": 0.011172056198120117, "step": 1139}
+{"info/global_step": 1140, "train_info/time_within_train_step": 34.11421012878418, "step": 1140}
+{"train_info/time_between_train_steps": 0.006524085998535156, "step": 1140}
+{"info/global_step": 1141, "train_info/time_within_train_step": 63.43292593955994, "step": 1141}
+{"train_info/time_between_train_steps": 0.01677250862121582, "step": 1141}
+{"info/global_step": 1142, "train_info/time_within_train_step": 44.83742022514343, "step": 1142}
+{"train_info/time_between_train_steps": 0.008651256561279297, "step": 1142}
+{"info/global_step": 1143, "train_info/time_within_train_step": 55.86494016647339, "step": 1143}
+{"train_info/time_between_train_steps": 0.010784149169921875, "step": 1143}
+{"info/global_step": 1144, "train_info/time_within_train_step": 62.356070041656494, "step": 1144}
+{"train_info/time_between_train_steps": 0.010961771011352539, "step": 1144}
+{"info/global_step": 1145, "train_info/time_within_train_step": 63.309513568878174, "step": 1145}
+{"train_info/time_between_train_steps": 0.011400699615478516, "step": 1145}
+{"info/global_step": 1146, "train_info/time_within_train_step": 62.36331272125244, "step": 1146}
+{"train_info/time_between_train_steps": 0.005605220794677734, "step": 1146}
+{"info/global_step": 1147, "train_info/time_within_train_step": 63.120012521743774, "step": 1147}
+{"train_info/time_between_train_steps": 0.010553598403930664, "step": 1147}
+{"info/global_step": 1148, "train_info/time_within_train_step": 37.34061312675476, "step": 1148}
+{"train_info/time_between_train_steps": 0.0056340694427490234, "step": 1148}
+{"info/global_step": 1149, "train_info/time_within_train_step": 60.350738525390625, "step": 1149}
+{"train_info/time_between_train_steps": 0.019970417022705078, "step": 1149}
+{"info/global_step": 1150, "train_info/time_within_train_step": 62.45649218559265, "step": 1150}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.89990234375, "train_info/memory_reserved": 22500.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734053148, "_runtime": 43599}, "step": 1150}
+{"logs": {"train/loss": 3.3751, "train/learning_rate": 2.7777777777777772e-05, "train/epoch": 42.01, "_timestamp": 1734053148, "_runtime": 43599}, "step": 1150}
+{"train_info/time_between_train_steps": 0.02438068389892578, "step": 1150}
+{"info/global_step": 1151, "train_info/time_within_train_step": 63.467451095581055, "step": 1151}
+{"train_info/time_between_train_steps": 0.015998363494873047, "step": 1151}
+{"info/global_step": 1152, "train_info/time_within_train_step": 43.18042850494385, "step": 1152}
+{"train_info/time_between_train_steps": 0.01005411148071289, "step": 1152}
+{"info/global_step": 1153, "train_info/time_within_train_step": 56.84199500083923, "step": 1153}
+{"train_info/time_between_train_steps": 0.007217884063720703, "step": 1153}
+{"info/global_step": 1154, "train_info/time_within_train_step": 62.47210502624512, "step": 1154}
+{"train_info/time_between_train_steps": 0.011588811874389648, "step": 1154}
+{"info/global_step": 1155, "train_info/time_within_train_step": 63.42383050918579, "step": 1155}
+{"train_info/time_between_train_steps": 0.006322383880615234, "step": 1155}
+{"info/global_step": 1156, "train_info/time_within_train_step": 62.447470903396606, "step": 1156}
+{"train_info/time_between_train_steps": 0.006862640380859375, "step": 1156}
+{"info/global_step": 1157, "train_info/time_within_train_step": 64.6243269443512, "step": 1157}
+{"train_info/time_between_train_steps": 0.012761116027832031, "step": 1157}
+{"info/global_step": 1158, "train_info/time_within_train_step": 62.93394064903259, "step": 1158}
+{"train_info/time_between_train_steps": 0.011074542999267578, "step": 1158}
+{"info/global_step": 1159, "train_info/time_within_train_step": 62.52000975608826, "step": 1159}
+{"train_info/time_between_train_steps": 0.0076389312744140625, "step": 1159}
+{"info/global_step": 1160, "train_info/time_within_train_step": 63.39960789680481, "step": 1160}
+{"train_info/time_between_train_steps": 0.006693363189697266, "step": 1160}
+{"info/global_step": 1161, "train_info/time_within_train_step": 62.411624908447266, "step": 1161}
+{"train_info/time_between_train_steps": 0.011851072311401367, "step": 1161}
+{"train_info/time_between_train_steps": 25.96849012374878, "step": 1161}
+{"info/global_step": 1162, "train_info/time_within_train_step": 62.49555993080139, "step": 1162}
+{"train_info/time_between_train_steps": 0.00648045539855957, "step": 1162}
+{"info/global_step": 1163, "train_info/time_within_train_step": 63.552978515625, "step": 1163}
+{"train_info/time_between_train_steps": 0.007097721099853516, "step": 1163}
+{"info/global_step": 1164, "train_info/time_within_train_step": 62.39278745651245, "step": 1164}
+{"train_info/time_between_train_steps": 0.007096767425537109, "step": 1164}
+{"info/global_step": 1165, "train_info/time_within_train_step": 62.97440814971924, "step": 1165}
+{"train_info/time_between_train_steps": 0.00566864013671875, "step": 1165}
+{"info/global_step": 1166, "train_info/time_within_train_step": 38.32840442657471, "step": 1166}
+{"train_info/time_between_train_steps": 0.00640869140625, "step": 1166}
+{"info/global_step": 1167, "train_info/time_within_train_step": 63.458611726760864, "step": 1167}
+{"train_info/time_between_train_steps": 0.005556583404541016, "step": 1167}
+{"info/global_step": 1168, "train_info/time_within_train_step": 62.33820080757141, "step": 1168}
+{"train_info/time_between_train_steps": 0.00589752197265625, "step": 1168}
+{"info/global_step": 1169, "train_info/time_within_train_step": 62.91913676261902, "step": 1169}
+{"train_info/time_between_train_steps": 0.008229494094848633, "step": 1169}
+{"info/global_step": 1170, "train_info/time_within_train_step": 62.81785869598389, "step": 1170}
+{"train_info/time_between_train_steps": 0.009745359420776367, "step": 1170}
+{"info/global_step": 1171, "train_info/time_within_train_step": 62.45976400375366, "step": 1171}
+{"train_info/time_between_train_steps": 0.005373239517211914, "step": 1171}
+{"info/global_step": 1172, "train_info/time_within_train_step": 63.276347637176514, "step": 1172}
+{"train_info/time_between_train_steps": 0.005557060241699219, "step": 1172}
+{"info/global_step": 1173, "train_info/time_within_train_step": 63.91732096672058, "step": 1173}
+{"train_info/time_between_train_steps": 0.005648374557495117, "step": 1173}
+{"info/global_step": 1174, "train_info/time_within_train_step": 63.1263747215271, "step": 1174}
+{"train_info/time_between_train_steps": 0.010015010833740234, "step": 1174}
+{"info/global_step": 1175, "train_info/time_within_train_step": 62.85387349128723, "step": 1175}
+{"train_info/time_between_train_steps": 0.0056993961334228516, "step": 1175}
+{"info/global_step": 1176, "train_info/time_within_train_step": 62.43407607078552, "step": 1176}
+{"train_info/time_between_train_steps": 0.005579471588134766, "step": 1176}
+{"info/global_step": 1177, "train_info/time_within_train_step": 54.12849712371826, "step": 1177}
+{"train_info/time_between_train_steps": 0.009720802307128906, "step": 1177}
+{"info/global_step": 1178, "train_info/time_within_train_step": 57.185861587524414, "step": 1178}
+{"train_info/time_between_train_steps": 0.0055446624755859375, "step": 1178}
+{"info/global_step": 1179, "train_info/time_within_train_step": 60.999852657318115, "step": 1179}
+{"train_info/time_between_train_steps": 0.011707544326782227, "step": 1179}
+{"info/global_step": 1180, "train_info/time_within_train_step": 46.46031141281128, "step": 1180}
+{"train_info/time_between_train_steps": 0.009036064147949219, "step": 1180}
+{"info/global_step": 1181, "train_info/time_within_train_step": 27.311551094055176, "step": 1181}
+{"train_info/time_between_train_steps": 0.010364770889282227, "step": 1181}
+{"info/global_step": 1182, "train_info/time_within_train_step": 27.32648992538452, "step": 1182}
+{"train_info/time_between_train_steps": 0.010413408279418945, "step": 1182}
+{"info/global_step": 1183, "train_info/time_within_train_step": 27.402886867523193, "step": 1183}
+{"train_info/time_between_train_steps": 0.006387472152709961, "step": 1183}
+{"info/global_step": 1184, "train_info/time_within_train_step": 63.054006814956665, "step": 1184}
+{"train_info/time_between_train_steps": 0.005467891693115234, "step": 1184}
+{"info/global_step": 1185, "train_info/time_within_train_step": 51.8393759727478, "step": 1185}
+{"train_info/time_between_train_steps": 0.005402088165283203, "step": 1185}
+{"info/global_step": 1186, "train_info/time_within_train_step": 49.28672409057617, "step": 1186}
+{"train_info/time_between_train_steps": 0.0056226253509521484, "step": 1186}
+{"info/global_step": 1187, "train_info/time_within_train_step": 62.54214644432068, "step": 1187}
+{"train_info/time_between_train_steps": 0.009933710098266602, "step": 1187}
+{"info/global_step": 1188, "train_info/time_within_train_step": 39.392176151275635, "step": 1188}
+{"train_info/time_between_train_steps": 0.007363796234130859, "step": 1188}
+{"train_info/time_between_train_steps": 26.06059980392456, "step": 1188}
+{"info/global_step": 1189, "train_info/time_within_train_step": 61.120763540267944, "step": 1189}
+{"train_info/time_between_train_steps": 0.011008262634277344, "step": 1189}
+{"info/global_step": 1190, "train_info/time_within_train_step": 38.63389277458191, "step": 1190}
+{"train_info/time_between_train_steps": 0.005581378936767578, "step": 1190}
+{"info/global_step": 1191, "train_info/time_within_train_step": 63.279454708099365, "step": 1191}
+{"train_info/time_between_train_steps": 0.01577925682067871, "step": 1191}
+{"info/global_step": 1192, "train_info/time_within_train_step": 62.5788300037384, "step": 1192}
+{"train_info/time_between_train_steps": 0.006010293960571289, "step": 1192}
+{"info/global_step": 1193, "train_info/time_within_train_step": 62.63861441612244, "step": 1193}
+{"train_info/time_between_train_steps": 0.005740165710449219, "step": 1193}
+{"info/global_step": 1194, "train_info/time_within_train_step": 63.282376766204834, "step": 1194}
+{"train_info/time_between_train_steps": 0.005972862243652344, "step": 1194}
+{"info/global_step": 1195, "train_info/time_within_train_step": 62.39777970314026, "step": 1195}
+{"train_info/time_between_train_steps": 0.005842447280883789, "step": 1195}
+{"info/global_step": 1196, "train_info/time_within_train_step": 63.19361710548401, "step": 1196}
+{"train_info/time_between_train_steps": 0.005181312561035156, "step": 1196}
+{"info/global_step": 1197, "train_info/time_within_train_step": 62.33691906929016, "step": 1197}
+{"train_info/time_between_train_steps": 0.0071697235107421875, "step": 1197}
+{"info/global_step": 1198, "train_info/time_within_train_step": 62.6214382648468, "step": 1198}
+{"train_info/time_between_train_steps": 0.015228033065795898, "step": 1198}
+{"info/global_step": 1199, "train_info/time_within_train_step": 62.85038089752197, "step": 1199}
+{"train_info/time_between_train_steps": 0.005597352981567383, "step": 1199}
+{"info/global_step": 1200, "train_info/time_within_train_step": 62.29661965370178, "step": 1200}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.89990234375, "train_info/memory_reserved": 22500.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734056082, "_runtime": 46533}, "step": 1200}
+{"logs": {"train/loss": 3.3526, "train/learning_rate": 0.0, "train/epoch": 44.01, "_timestamp": 1734056082, "_runtime": 46533}, "step": 1200}
+{"train_info": {"train_info/memory_allocated": 1922.525390625, "train_info/memory_max_allocated": 20715.89990234375, "train_info/memory_reserved": 22500.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734056085, "_runtime": 46536}, "step": 1200}
+{"logs": {"train/train_runtime": 46536.6103, "train/train_samples_per_second": 13.203, "train/train_steps_per_second": 0.026, "train/total_flos": 3.2521451470848e+17, "train/train_loss": 4.4521270370483395, "train/epoch": 44.01, "_timestamp": 1734056085, "_runtime": 46536}, "step": 1200}
+{"train_info": {"train_info/memory_allocated": 1922.5244140625, "train_info/memory_max_allocated": 20715.89990234375, "train_info/memory_reserved": 22500.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734056091, "_runtime": 46542}, "step": 1200}
+{"logs": {"eval/loss": 4.425502300262451, "eval/runtime": 4.5484, "eval/samples_per_second": 22.426, "eval/steps_per_second": 1.539, "train/epoch": 44.01, "_timestamp": 1734056091, "_runtime": 46542}, "step": 1200}
+{"train_info": {"train_info/memory_allocated": 1922.5244140625, "train_info/memory_max_allocated": 20715.89990234375, "train_info/memory_reserved": 22500.0, "train_info/memory_max_reserved": 24198.0, "_timestamp": 1734056091, "_runtime": 46542}, "step": 1200}
+{"logs": {"eval//local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_loss": 4.425502300262451, "eval//local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_ppl": 83.5547662641636, "eval//local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_runtime": 4.5484, "eval//local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py_samples_per_second": 22.426, "train/epoch": 44.01, "_timestamp": 1734056091, "_runtime": 46542}, "step": 1200}
diff --git a/pytorch_model.bin b/pytorch_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..38ac9954f469fafdf639fb422b47d67f8a40dc5b
--- /dev/null
+++ b/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4e27e6700f2d4b2a7cc35e1bfec8f1cd9b7bd8425158df365f791e673267f38
+size 510424169
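pytorch_model.bin is checked in as a Git LFS pointer rather than the raw weights: the three lines give the pointer spec version, the SHA-256 of the actual blob, and its size in bytes (510424169, roughly 0.5 GB, consistent with GPT-2-small float32 weights). A small sketch, assuming the pointer text and the downloaded blob are both available as local files (paths are illustrative), that verifies the blob against the pointer:

    import hashlib

    # Minimal sketch: verify a downloaded blob against a Git LFS pointer file.
    # The pointer format is exactly the three lines shown above
    # (version / "oid sha256:<hex>" / "size <bytes>"); file paths are illustrative.
    def verify_lfs_pointer(pointer_path, blob_path):
        with open(pointer_path) as f:
            fields = dict(line.strip().split(" ", 1) for line in f if " " in line)
        expected_oid = fields["oid"].split(":", 1)[1]
        expected_size = int(fields["size"])

        h = hashlib.sha256()
        actual_size = 0
        with open(blob_path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
                actual_size += len(chunk)
        return h.hexdigest() == expected_oid and actual_size == expected_size

    print(verify_lfs_pointer("pytorch_model.bin.pointer", "pytorch_model.bin"))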
diff --git a/shuffle_control_de_DE_randinit_seed53.log b/shuffle_control_de_DE_randinit_seed53.log
new file mode 100755
index 0000000000000000000000000000000000000000..649caf014382a1fb0deb760ef89dfe36e7733e88
--- /dev/null
+++ b/shuffle_control_de_DE_randinit_seed53.log
@@ -0,0 +1,137 @@
+|=>> 12/10 [22:26:15] - mistral - INFO :: Starting Run: shuffle_control_de_DE_randinit_seed53...
+|=>> 12/10 [22:26:15] - mistral - INFO :: Setting Random Seed to 53!
+|=>> 12/10 [22:26:15] - mistral - INFO :: Building Tokenizer and Initializing `gpt2-small` via AutoModel/AutoConfig...
+|=>> 12/10 [22:26:15] - mistral - INFO :: Using Configs For Model From: /local/xiulyang/mission-impossible-language-models/mistral/conf/models/gpt2-small-50266.json ...
+|=>> 12/10 [22:26:15] - mistral.models.auto - INFO :: Building Hugging Face GPT2Config from provided configs: {'activation_function': 'gelu_new', 'architectures': ['GPT2LMHeadModel'], 'attn_pdrop': 0.1, 'embd_pdrop': 0.1, 'eos_token_id': 50265, 'bos_token_id': 50265, 'initializer_range': 0.02, 'layer_norm_epsilon': 1e-05, 'model_type': 'gpt2', 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_inner': None, 'n_layer': 12, 'n_positions': 1024, 'reorder_and_upcast_attn': True, 'resid_pdrop': 0.1, 'scale_attn_by_inverse_layer_idx': True, 'scale_attn_weights': True, 'summary_activation': None, 'summary_first_dropout': 0.2, 'summary_proj_to_labels': True, 'summary_type': 'cls_index', 'summary_use_proj': True, 'task_specific_params': {'text-generation': {'do_sample': True, 'max_length': 1024}}, 'torch_dtype': 'float32', 'transformers_version': '4.35.2', 'use_cache': False, 'vocab_size': 50266} ...
+|=>> 12/10 [22:26:15] - mistral.models.auto - INFO :: Fetching Hugging Face [Fast] AutoTokenizer for Model: `gpt2`...
+|=>> 12/10 [22:26:15] - mistral.models.auto - INFO :: Using a Pretokenized Dataset
+|=>> 12/10 [22:26:15] - mistral.models.auto - INFO :: Initializing Custom GPT-2 Model from Configuration: `gpt2`...
+|=>> 12/10 [22:26:19] - mistral - INFO :: Setting Training Arguments from Quinfig...
+|=>> 12/10 [22:26:19] - mistral.args.training - INFO :: Setting Gradient Accumulation Steps = `64` [BSZ: 512 World Size: 1 Device BSZ: 8]
+|=>> 12/10 [22:26:19] - mistral - INFO :: Downloading and Preprocessing Dataset `/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py`...
+|=>> 12/10 [22:26:20] - datasets_modules.datasets.babylm_dataset.a557ecebcec5aabd33660301ba889db2028afe94218a3c44654a00198e95ea8b.babylm_dataset - INFO :: Generating examples from = /local/xiulyang/mission-impossible-language-models/data/multilingual/multilingual_data_perturbed/shuffle_control_de/train
+|=>> 12/10 [22:26:20] - datasets_modules.datasets.babylm_dataset.a557ecebcec5aabd33660301ba889db2028afe94218a3c44654a00198e95ea8b.babylm_dataset - INFO :: Total sentences: 0
+|=>> 12/10 [22:26:20] - datasets_modules.datasets.babylm_dataset.a557ecebcec5aabd33660301ba889db2028afe94218a3c44654a00198e95ea8b.babylm_dataset - INFO :: Loading pre-tokenized data
+|=>> 12/10 [22:26:20] - datasets_modules.datasets.babylm_dataset.a557ecebcec5aabd33660301ba889db2028afe94218a3c44654a00198e95ea8b.babylm_dataset - INFO :: Concatenating tokenized data using EOS token
+|=>> 12/10 [22:26:20] - datasets_modules.datasets.babylm_dataset.a557ecebcec5aabd33660301ba889db2028afe94218a3c44654a00198e95ea8b.babylm_dataset - INFO :: Chunking tokens into sublists of 1024
+|=>> 12/12 [14:18:21] - mistral - INFO :: Starting Run: shuffle_control_de_DE_randinit_seed53...
+|=>> 12/12 [14:18:21] - mistral - INFO :: Setting Random Seed to 53!
+|=>> 12/12 [14:18:21] - mistral - INFO :: Building Tokenizer and Initializing `gpt2-small` via AutoModel/AutoConfig...
+|=>> 12/12 [14:18:21] - mistral - INFO :: Using Configs For Model From: /local/xiulyang/mission-impossible-language-models/mistral/conf/models/gpt2-small-50266-DE.json ...
+|=>> 12/12 [14:18:21] - mistral.models.auto - INFO :: Building Hugging Face GPT2Config from provided configs: {'activation_function': 'gelu_new', 'architectures': ['GPT2LMHeadModel'], 'attn_pdrop': 0.1, 'embd_pdrop': 0.1, 'eos_token_id': 50265, 'bos_token_id': 50265, 'initializer_range': 0.02, 'layer_norm_epsilon': 1e-05, 'model_type': 'gpt2', 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_inner': None, 'n_layer': 12, 'n_positions': 1024, 'reorder_and_upcast_attn': True, 'resid_pdrop': 0.1, 'scale_attn_by_inverse_layer_idx': True, 'scale_attn_weights': True, 'summary_activation': None, 'summary_first_dropout': 0.2, 'summary_proj_to_labels': True, 'summary_type': 'cls_index', 'summary_use_proj': True, 'task_specific_params': {'text-generation': {'do_sample': True, 'max_length': 1024}}, 'torch_dtype': 'float32', 'transformers_version': '4.35.2', 'use_cache': False, 'vocab_size': 50266} ...
+|=>> 12/12 [14:18:21] - mistral.models.auto - INFO :: Fetching Hugging Face [Fast] AutoTokenizer for Model: `gpt2`...
+|=>> 12/12 [14:18:21] - mistral.models.auto - INFO :: Using a Pretokenized Dataset
+|=>> 12/12 [14:18:21] - mistral.models.auto - INFO :: Initializing Custom GPT-2 Model from Configuration: `gpt2`...
+|=>> 12/12 [14:18:25] - mistral - INFO :: Setting Training Arguments from Quinfig...
+|=>> 12/12 [14:18:25] - mistral.args.training - INFO :: Setting Gradient Accumulation Steps = `64` [BSZ: 512 World Size: 1 Device BSZ: 8]
+|=>> 12/12 [14:18:25] - mistral - INFO :: Downloading and Preprocessing Dataset `/local/xiulyang/mission-impossible-language-models/training/babylm_dataset.py`...
+|=>> 12/12 [14:18:26] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Generating examples from = /local/xiulyang/mission-impossible-language-models/data/multilingual/multilingual_data_perturbed/shuffle_control_de/train
+|=>> 12/12 [14:18:26] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Total sentences: 1128954
+|=>> 12/12 [14:18:26] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Loading pre-tokenized data
+|=>> 12/12 [14:18:31] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Concatenating tokenized data using EOS token
+|=>> 12/12 [14:18:31] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Chunking tokens into sublists of 1024
+|=>> 12/12 [14:18:32] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Writing dataset as space-separated sequences of tokens
+|=>> 12/12 [14:18:35] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Generating examples from = /local/xiulyang/mission-impossible-language-models/data/multilingual/multilingual_data_perturbed/shuffle_control_de/dev
+|=>> 12/12 [14:18:35] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Total sentences: 6288
+|=>> 12/12 [14:18:35] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Loading pre-tokenized data
+|=>> 12/12 [14:18:35] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Concatenating tokenized data using EOS token
+|=>> 12/12 [14:18:35] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Chunking tokens into sublists of 1024
+|=>> 12/12 [14:18:35] - datasets_modules.datasets.babylm_dataset.43b5d6f57ff7566a9e5fca41a7eeab46164e1c1b30ed1dcb135a5808f8c10159.babylm_dataset - INFO :: Writing dataset as space-separated sequences of tokens
+|=>> 12/12 [14:18:35] - mistral.corpora.auto - INFO :: Building Tokenized Indexed Dataset for {dataset_id}/{dataset_name}...
+|=>> 12/12 [14:18:35] - mistral.corpora.auto - INFO :: Building Indexed Dataset for train
+|=>> 12/12 [14:19:05] - mistral.corpora.auto - INFO :: Building Indexed Dataset for validation
+|=>> 12/12 [14:19:05] - mistral - INFO :: Initializing Model Trainer...
+|=>> 12/12 [14:19:05] - mistral - INFO :: Training Arguments: TrainingArguments(
+_n_gpu=1,
+adafactor=False,
+adam_beta1=0.9,
+adam_beta2=0.999,
+adam_epsilon=1e-08,
+bf16=False,
+bf16_full_eval=False,
+data_seed=53,
+dataloader_drop_last=False,
+dataloader_num_workers=0,
+dataloader_pin_memory=True,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+debug=[],
+deepspeed=None,
+disable_tqdm=False,
+do_eval=True,
+do_predict=False,
+do_train=True,
+eval_accumulation_steps=None,
+eval_delay=0,
+eval_steps=1000,
+evaluation_strategy=IntervalStrategy.STEPS,
+fp16=True,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+gradient_accumulation_steps=64,
+gradient_checkpointing=False,
+greater_is_better=None,
+group_by_length=False,
+half_precision_backend=auto,
+hub_model_id=None,
+hub_strategy=HubStrategy.EVERY_SAVE,
+hub_token=<HUB_TOKEN>,
+ignore_data_skip=False,
+label_names=None,
+label_smoothing_factor=0.0,
+learning_rate=0.0006,
+length_column_name=length,
+load_best_model_at_end=False,
+local_rank=-1,
+log_level=-1,
+log_level_replica=-1,
+log_on_each_node=True,
+logging_dir=logs,
+logging_first_step=True,
+logging_nan_inf_filter=True,
+logging_steps=50,
+logging_strategy=IntervalStrategy.STEPS,
+lr_scheduler_type=SchedulerType.LINEAR,
+max_grad_norm=1.0,
+max_steps=1200,
+metric_for_best_model=None,
+mp_parameters=,
+no_cuda=False,
+num_train_epochs=3.0,
+optim=OptimizerNames.ADAMW_HF,
+output_dir=//local/xiulyang/babylm_models/shuffle_control_de_DE_randinit/babylm_shuffle_control_de_DE_randinit_seed53/runs/shuffle_control_de_DE_randinit_seed53,
+overwrite_output_dir=False,
+past_index=-1,
+per_device_eval_batch_size=16,
+per_device_train_batch_size=8,
+prediction_loss_only=True,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+remove_unused_columns=True,
+report_to=[],
+resume_from_checkpoint=None,
+run_name=shuffle_control_de_DE_randinit_seed53,
+save_on_each_node=False,
+save_steps=1000,
+save_strategy=IntervalStrategy.STEPS,
+save_total_limit=None,
+seed=53,
+sharded_ddp=[],
+skip_memory_metrics=True,
+tf32=None,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+use_legacy_prediction_loop=False,
+warmup_ratio=0.0,
+warmup_steps=120,
+weight_decay=0.1,
+xpu_backend=None,
+)
+|=>> 12/12 [14:19:05] - mistral.core.callbacks - INFO :: Setting W&B Project: xiulin-yang-compling
+|=>> 12/12 [14:19:08] - mistral - INFO :: Training...
+|=>> 12/12 [14:19:09] - mistral.core.callbacks - INFO :: Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
+|=>> 12/13 [03:14:46] - mistral - INFO :: ...and that's all folks!
+|=>> 12/13 [03:14:46] - mistral - INFO :: Running final evaluation...
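The run log above documents the data pipeline (load pre-tokenized sentences, concatenate them with the EOS token, chunk the stream into sublists of 1024) and the batching arithmetic (gradient accumulation steps = global batch 512 / (1 GPU * per-device batch 8) = 64). A minimal sketch of that chunking step, under the assumption that sentences arrive as flat lists of token ids; the actual implementation lives in training/babylm_dataset.py:

    # Minimal sketch of the preprocessing the log describes: concatenate the
    # pre-tokenized sentences with the EOS id, then chunk the stream into
    # fixed-length blocks of 1024 tokens. Illustrative only; the actual logic
    # lives in training/babylm_dataset.py, and dropping the ragged tail is an
    # assumption, not something the log confirms.
    EOS_ID = 50265      # eos_token_id from the model config
    BLOCK_SIZE = 1024   # chunk length reported in the log

    def chunk_sentences(tokenized_sentences):
        stream = []
        for sent in tokenized_sentences:
            stream.extend(sent)
            stream.append(EOS_ID)
        return [
            stream[i:i + BLOCK_SIZE]
            for i in range(0, len(stream) - BLOCK_SIZE + 1, BLOCK_SIZE)
        ]

    # Batching arithmetic from the log: global batch 512, world size 1, device batch 8.
    grad_accum_steps = 512 // (1 * 8)   # = 64, matching "Gradient Accumulation Steps = 64"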
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..00d0bb84ef853fda188d996b93c143bd905b3674
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1 @@
+{"tokenizer_class": "PassthroughTokenizer"}
\ No newline at end of file
diff --git a/training_args.bin b/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cb6a54a120d1853b894ff9ce3901ecd7e98d650e
--- /dev/null
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec32810d4409696d65bce9761313a0cffdb55152ef1e6f8c42d0c5245db5910b
+size 3183