diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..aece861d0dd177393179c6d0d9a1f8ff0e664ce9 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e91d84b2af78a4ec3aa5f37f91a0d45e6f4e13e3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,69 @@
+---
+base_model: trashpanda-org/Llama3-24B-Mullein-v1
+library_name: transformers
+model_name: Gullein
+tags:
+- generated_from_trainer
+- axolotl
+- trl
+- grpo
+licence: license
+---
+
+# Model Card for Gullein
+
+This model is a fine-tuned version of [trashpanda-org/Llama3-24B-Mullein-v1](https://huggingface.co/trashpanda-org/Llama3-24B-Mullein-v1).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="trashpanda-org/Gullein", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[Visualize in Weights & Biases](https://wandb.ai/sjwang05-personal/greg-grpo/runs/0a95uxly)
+
+
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
+### Framework versions
+
+- TRL: 0.16.1
+- Transformers: 4.50.0.dev0
+- Pytorch: 2.6.0
+- Datasets: 3.5.0
+- Tokenizers: 0.21.1
+
+## Citations
+
+Cite GRPO as:
+
+```bibtex
+@article{zhihong2024deepseekmath,
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+ year = 2024,
+ eprint = {arXiv:2402.03300},
+}
+
+```
+
+Cite TRL as:
+
+```bibtex
+@misc{vonwerra2022trl,
+ title = {{TRL: Transformer Reinforcement Learning}},
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+ year = 2020,
+ journal = {GitHub repository},
+ publisher = {GitHub},
+ howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
\ No newline at end of file
diff --git a/checkpoint-400/config.json b/checkpoint-400/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c26b5aeb19eceb28981abb0cbc2f345e0e57e453
--- /dev/null
+++ b/checkpoint-400/config.json
@@ -0,0 +1,28 @@
+{
+ "architectures": [
+ "MistralForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "initializer_range": 0.02,
+ "intermediate_size": 32768,
+ "max_position_embeddings": 32768,
+ "model_type": "mistral",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 8,
+ "pad_token_id": 11,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 100000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.50.0.dev0",
+ "unsloth_fixed": true,
+ "use_cache": false,
+ "vocab_size": 131072
+}
diff --git a/checkpoint-400/generation_config.json b/checkpoint-400/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0f67e415ceed468654cfb334307983cb5afdba74
--- /dev/null
+++ b/checkpoint-400/generation_config.json
@@ -0,0 +1,9 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": 2,
+ "max_length": 32768,
+ "pad_token_id": 11,
+ "transformers_version": "4.50.0.dev0"
+}
diff --git a/checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6fdbf2a06efde2dd68373e5bcf06bff651fe3a8e
--- /dev/null
+++ b/checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bc7e31bed9972d413cf7bedba5c91a213ac6d8362bb384895dae7ba350d0591
+size 20311124931
diff --git a/checkpoint-400/global_step400/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-400/global_step400/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5674adc3fa84f65041e8be1a767499326e51737b
--- /dev/null
+++ b/checkpoint-400/global_step400/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f49d1d175705227da4a46bb6920d6b7e7e7b48c76f964310b74e5d34be71c7a7
+size 20311124931
diff --git a/checkpoint-400/global_step400/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-400/global_step400/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..08fb610be21104cb2b8ef1457d4cac3ccaad238a
--- /dev/null
+++ b/checkpoint-400/global_step400/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a5a69789006cb218dba788075d67c21c504a0bcdf46649689c988dae0a02111
+size 20311124931
diff --git a/checkpoint-400/global_step400/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-400/global_step400/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ad96eafeacf61c2b68e3d62632a70a5e0360550a
--- /dev/null
+++ b/checkpoint-400/global_step400/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7844b54eef513024f037e19aef465a0c588e59da19ed0725a209259c4ec1ee1
+size 20311124931
diff --git a/checkpoint-400/global_step400/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-400/global_step400/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f811915c619ccd25000cafec37b597927944fd48
--- /dev/null
+++ b/checkpoint-400/global_step400/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47721e50a92ce9c0663624581bb86365ad39e0fc92bf28bea6bebc1b729c963b
+size 20311124931
diff --git a/checkpoint-400/global_step400/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-400/global_step400/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ab6814ff4bb26cd6f5bbfe8730a3f28384ab4b80
--- /dev/null
+++ b/checkpoint-400/global_step400/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc0f02cc8c42aa9a5effe480a1b92397561d3c01ee3183dcbe3bf06424660a74
+size 20311124931
diff --git a/checkpoint-400/global_step400/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-400/global_step400/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..08df495308de65caf94b9188069cf9f237013c9d
--- /dev/null
+++ b/checkpoint-400/global_step400/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aba0705cd965e726a4a278c86a5d03f5ef448856b63a3f3c18461f7e05e666a7
+size 20311124931
diff --git a/checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..00cca730481d15b872d8a300f1623f4621a5f4a7
--- /dev/null
+++ b/checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62b860ddcabd2ff757b0e9541b9ef17909e52c5b001cab4377485c2d81de86e5
+size 187277
diff --git a/checkpoint-400/global_step400/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-400/global_step400/zero_pp_rank_1_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9d3061ba6c932a38499355bd282dadeb2284fd17
--- /dev/null
+++ b/checkpoint-400/global_step400/zero_pp_rank_1_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd55750ae48de0ffcc217558657b767863c9d0e77c08df18ac174440920e9a5c
+size 187277
diff --git a/checkpoint-400/global_step400/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-400/global_step400/zero_pp_rank_2_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4012e6799cd41974e7c24301a589ca296114f313
--- /dev/null
+++ b/checkpoint-400/global_step400/zero_pp_rank_2_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aba3fde704f91abb752244ffa642b335b3fe4060615ba398280b8e3a458d0e31
+size 187277
diff --git a/checkpoint-400/global_step400/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-400/global_step400/zero_pp_rank_3_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..94645015c98b9018f9a46707a96ec7156e14bb75
--- /dev/null
+++ b/checkpoint-400/global_step400/zero_pp_rank_3_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:019a542a26a616e6ac409c16c376f4db36c6dc86a5f87e46263b44bd5d4eb19a
+size 187277
diff --git a/checkpoint-400/global_step400/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-400/global_step400/zero_pp_rank_4_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..d2b379e7f08f8e4796f1140a8995dda8adf34c3f
--- /dev/null
+++ b/checkpoint-400/global_step400/zero_pp_rank_4_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10ff5df5c5659a20ad81ed030487ea498c139ce4e53aa64b75856f9c462ed6ae
+size 187277
diff --git a/checkpoint-400/global_step400/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-400/global_step400/zero_pp_rank_5_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..16f624733b13127b08f653e7c2090b4f82069e8e
--- /dev/null
+++ b/checkpoint-400/global_step400/zero_pp_rank_5_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6faef55695fd60511ebb3bd45fd73fb5a16efc343fe39e9ca5fd9c338eb3c6d
+size 187277
diff --git a/checkpoint-400/global_step400/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-400/global_step400/zero_pp_rank_6_mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..59860edf6ca2cb7cb86a2789db99aca8b2f771d8
--- /dev/null
+++ b/checkpoint-400/global_step400/zero_pp_rank_6_mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65d26e053b538d2b2c4befe9b43b7dfb214bacdc1c0fd7d249cffb26c828ba31
+size 187277
diff --git a/checkpoint-400/latest b/checkpoint-400/latest
new file mode 100644
index 0000000000000000000000000000000000000000..e5bdf58d4f29d34e909da25905fad376f73e7c29
--- /dev/null
+++ b/checkpoint-400/latest
@@ -0,0 +1 @@
+global_step400
\ No newline at end of file
diff --git a/checkpoint-400/model-00001-of-00010.safetensors b/checkpoint-400/model-00001-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d69da1e70bc946956fcf839b8b5cfb14bd725bad
--- /dev/null
+++ b/checkpoint-400/model-00001-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:341120b3ffa785723ab984164c46e831c943c924b41b6b8403dcfa15bc45a4d8
+size 4781571736
diff --git a/checkpoint-400/model-00002-of-00010.safetensors b/checkpoint-400/model-00002-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0472816cc82c7d067f381880ab05c002b7ff680b
--- /dev/null
+++ b/checkpoint-400/model-00002-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:936f99be0a90fd10d1ee3868b327792339f38ded0971ce0e396c78e7a688d503
+size 4781592784
diff --git a/checkpoint-400/model-00003-of-00010.safetensors b/checkpoint-400/model-00003-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6f606c58aa1660f661bf691813e850815b05cd69
--- /dev/null
+++ b/checkpoint-400/model-00003-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fde082e6f88ebbd993d4829663861e818a49630c2776ef34f4429a21b2eeb7b
+size 4781592800
diff --git a/checkpoint-400/model-00004-of-00010.safetensors b/checkpoint-400/model-00004-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..89b088bccfd51662ca17321857196504cf935178
--- /dev/null
+++ b/checkpoint-400/model-00004-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d1361ffee866888fb93abf080b867acd0887a5c43eceac5dfd5bdf319a0977a
+size 4886471600
diff --git a/checkpoint-400/model-00005-of-00010.safetensors b/checkpoint-400/model-00005-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..41f5a6683f200a97fc7aaccbd7d5d96b215a773b
--- /dev/null
+++ b/checkpoint-400/model-00005-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d215f13feede6e787ad7f3371d8576a85c0d33af430571bb1bb7e21e692b01b
+size 4781592824
diff --git a/checkpoint-400/model-00006-of-00010.safetensors b/checkpoint-400/model-00006-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..424b8a5d382f83d68f51bec19105f35a2873e882
--- /dev/null
+++ b/checkpoint-400/model-00006-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0aa6b08c60e53e9041ec247484fe65b42411397894131b0f020f333cc3c96d1
+size 4781592816
diff --git a/checkpoint-400/model-00007-of-00010.safetensors b/checkpoint-400/model-00007-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c1fa1caa89544a92de6323a3030455c2a3758dcd
--- /dev/null
+++ b/checkpoint-400/model-00007-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bb2c9f58847406d8d018cc6efe7c719e14734859302b30e125a69a978c1d37b
+size 4886471600
diff --git a/checkpoint-400/model-00008-of-00010.safetensors b/checkpoint-400/model-00008-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7a5ade633ca9fcd8ab97cab5aff395168cf06c4b
--- /dev/null
+++ b/checkpoint-400/model-00008-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:631fbc6d4565deb46663cb28cb83235fb0f3ec745109cd28c1e6aab4f21a2a5e
+size 4781592824
diff --git a/checkpoint-400/model-00009-of-00010.safetensors b/checkpoint-400/model-00009-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5e2d05a9fcdd086d2429cea44a21e15250c31b7f
--- /dev/null
+++ b/checkpoint-400/model-00009-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cba6f98ba3a09acdd33eeec30d0d816790298d7fdeff856f64cd3a2e36193779
+size 4781592816
diff --git a/checkpoint-400/model-00010-of-00010.safetensors b/checkpoint-400/model-00010-of-00010.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0ed82bf1c0c50c6ed06df2476712462b1995524b
--- /dev/null
+++ b/checkpoint-400/model-00010-of-00010.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20e578df9b950927387edac80e15fd049aa590bb03b7a42afb5179b75ee8b31b
+size 3900777072
diff --git a/checkpoint-400/model.safetensors.index.json b/checkpoint-400/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..fea625e56294eba8231f10d76c291f8fc78a3dbd
--- /dev/null
+++ b/checkpoint-400/model.safetensors.index.json
@@ -0,0 +1,370 @@
+{
+ "metadata": {
+ "total_size": 47144806400
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00010-of-00010.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00003-of-00010.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00010.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00005-of-00010.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00010.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00010.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00005-of-00010.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00010.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00010.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00010.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00006-of-00010.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00006-of-00010.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00010.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00006-of-00010.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00010.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00006-of-00010.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.input_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.mlp.down_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.mlp.gate_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.mlp.up_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.post_attention_layernorm.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.28.self_attn.v_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.29.input_layernorm.weight": "model-00008-of-00010.safetensors",
+ "model.layers.29.mlp.down_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.29.mlp.gate_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.29.mlp.up_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.29.post_attention_layernorm.weight": "model-00008-of-00010.safetensors",
+ "model.layers.29.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.29.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.29.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.29.self_attn.v_proj.weight": "model-00007-of-00010.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00010.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00010.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00010.safetensors",
+ "model.layers.30.input_layernorm.weight": "model-00008-of-00010.safetensors",
+ "model.layers.30.mlp.down_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.30.mlp.gate_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.30.mlp.up_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.30.post_attention_layernorm.weight": "model-00008-of-00010.safetensors",
+ "model.layers.30.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.30.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.30.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.30.self_attn.v_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.input_layernorm.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.mlp.down_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.mlp.gate_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.mlp.up_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.post_attention_layernorm.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.31.self_attn.v_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.input_layernorm.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.mlp.down_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.mlp.gate_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.mlp.up_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.post_attention_layernorm.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.32.self_attn.v_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.33.input_layernorm.weight": "model-00009-of-00010.safetensors",
+ "model.layers.33.mlp.down_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.33.mlp.gate_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.33.mlp.up_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.33.post_attention_layernorm.weight": "model-00009-of-00010.safetensors",
+ "model.layers.33.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.33.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.33.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.33.self_attn.v_proj.weight": "model-00008-of-00010.safetensors",
+ "model.layers.34.input_layernorm.weight": "model-00009-of-00010.safetensors",
+ "model.layers.34.mlp.down_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.34.mlp.gate_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.34.mlp.up_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.34.post_attention_layernorm.weight": "model-00009-of-00010.safetensors",
+ "model.layers.34.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.34.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.34.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.34.self_attn.v_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.input_layernorm.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.mlp.down_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.mlp.gate_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.mlp.up_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.post_attention_layernorm.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.35.self_attn.v_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.input_layernorm.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.mlp.down_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.mlp.gate_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.mlp.up_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.post_attention_layernorm.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.36.self_attn.v_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.37.input_layernorm.weight": "model-00010-of-00010.safetensors",
+ "model.layers.37.mlp.down_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.37.mlp.gate_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.37.mlp.up_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.37.post_attention_layernorm.weight": "model-00010-of-00010.safetensors",
+ "model.layers.37.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.37.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.37.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.37.self_attn.v_proj.weight": "model-00009-of-00010.safetensors",
+ "model.layers.38.input_layernorm.weight": "model-00010-of-00010.safetensors",
+ "model.layers.38.mlp.down_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.38.mlp.gate_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.38.mlp.up_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.38.post_attention_layernorm.weight": "model-00010-of-00010.safetensors",
+ "model.layers.38.self_attn.k_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.38.self_attn.o_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.38.self_attn.q_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.38.self_attn.v_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.input_layernorm.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.mlp.down_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.mlp.gate_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.mlp.up_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.post_attention_layernorm.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.self_attn.k_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.self_attn.o_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.self_attn.q_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.39.self_attn.v_proj.weight": "model-00010-of-00010.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00010.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00010.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00003-of-00010.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00010.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00010.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00003-of-00010.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00010.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00010.safetensors",
+ "model.norm.weight": "model-00010-of-00010.safetensors"
+ }
+}
diff --git a/checkpoint-400/rng_state_0.pth b/checkpoint-400/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b6473612e41c5cfd6973c2e71fa5f3ad2b2bcad1
--- /dev/null
+++ b/checkpoint-400/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:575119a228f98110923ffa2dedcb50e3317251b26054355d015e0b2240d566f2
+size 15984
diff --git a/checkpoint-400/rng_state_1.pth b/checkpoint-400/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..644d27d88cd0808b87009b6b79ba141b182795b1
--- /dev/null
+++ b/checkpoint-400/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc091d2c64e295da198fc50a521da2b2e71efaede488211abd625b6121c779b1
+size 15920
diff --git a/checkpoint-400/rng_state_2.pth b/checkpoint-400/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..51a90e0f5c88c4d53cea095de92f8b3a33554e6c
--- /dev/null
+++ b/checkpoint-400/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f912bfa89ff34adaabe74ec6c417b1769ef0ddca0aad75e7c95b9d6ac616051f
+size 15920
diff --git a/checkpoint-400/rng_state_3.pth b/checkpoint-400/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2c9ad846777abcbbf4166825bf301042cc61d4e3
--- /dev/null
+++ b/checkpoint-400/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70f9f41c64234ad299aed7714f9e96d7324ce92432513b5ce6d5220c09e61613
+size 15984
diff --git a/checkpoint-400/rng_state_4.pth b/checkpoint-400/rng_state_4.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d99483292d1ebd3c0d986f8accb397fec356bfe2
--- /dev/null
+++ b/checkpoint-400/rng_state_4.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01d2cce8f0c97c1d155c59ed8c7e97477a25f08b3645ee125e8a3f76a47b15b9
+size 15984
diff --git a/checkpoint-400/rng_state_5.pth b/checkpoint-400/rng_state_5.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ec7ac73579b204be367e8ee8cb9f9254c30a0bac
--- /dev/null
+++ b/checkpoint-400/rng_state_5.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c22c59d534fded5bbba3e7c10e6a84de89c6d7483b7282c67b689ec7d387a1f
+size 15984
diff --git a/checkpoint-400/rng_state_6.pth b/checkpoint-400/rng_state_6.pth
new file mode 100644
index 0000000000000000000000000000000000000000..bb4af9e8818eda5ac5d213e1635c4202fd41bdf2
--- /dev/null
+++ b/checkpoint-400/rng_state_6.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a778a51823224b752bd30584c37e64773becf6980befa7bf4928568bc6306899
+size 15984
diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..eb18b4ea2854e5632cab5f256fd1f485dec2a44c
--- /dev/null
+++ b/checkpoint-400/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7f146e427e8bc70e8f2a8515e9b2cda6f2c96e111c734dc72553da177aebf46
+size 1064
diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..a47054b449916f808ab94e6d2375c00ead78cb8f
--- /dev/null
+++ b/checkpoint-400/special_tokens_map.json
@@ -0,0 +1,1032 @@
+{
+ "additional_special_tokens": [
+ "",
+ "",
+ "",
+ "[INST]",
+ "[/INST]",
+ "[AVAILABLE_TOOLS]",
+ "[/AVAILABLE_TOOLS]",
+ "[TOOL_RESULTS]",
+ "[/TOOL_RESULTS]",
+ "[TOOL_CALLS]",
+ "[IMG]",
+ "",
+ "[IMG_BREAK]",
+ "[IMG_END]",
+ "[PREFIX]",
+ "[MIDDLE]",
+ "[SUFFIX]",
+ "[SYSTEM_PROMPT]",
+ "[/SYSTEM_PROMPT]",
+ "[TOOL_CONTENT]",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "