mrfakename zRzRzRzRzRzRzR commited on
Commit
d8e7fb6
·
verified ·
0 Parent(s):

Duplicate from THUDM/GLM-Z1-9B-0414

Browse files

Co-authored-by: zR <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Zhipu AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - zh
5
+ - en
6
+ pipeline_tag: text-generation
7
+ library_name: transformers
8
+ ---
9
+
10
+ # GLM-Z1-9B-0414
11
+
12
+ ## Introduction
13
+
14
+ The GLM family welcomes a new generation of open-source models, the **GLM-4-32B-0414** series, featuring 32 billion parameters. Its performance is comparable to OpenAI's GPT series and DeepSeek's V3/R1 series, and it supports very user-friendly local deployment features. GLM-4-32B-Base-0414 was pre-trained on 15T of high-quality data, including a large amount of reasoning-type synthetic data, laying the foundation for subsequent reinforcement learning extensions. In the post-training stage, in addition to human preference alignment for dialogue scenarios, we also enhanced the model's performance in instruction following, engineering code, and function calling using techniques such as rejection sampling and reinforcement learning, strengthening the atomic capabilities required for agent tasks. GLM-4-32B-0414 achieves good results in areas such as engineering code, Artifact generation, function calling, search-based Q&A, and report generation. Some benchmarks even rival larger models like GPT-4o and DeepSeek-V3-0324 (671B).
15
+
16
+ **GLM-Z1-32B-0414** is a reasoning model with **deep thinking capabilities**. This was developed based on GLM-4-32B-0414 through cold start and extended reinforcement learning, as well as further training of the model on tasks involving mathematics, code, and logic. Compared to the base model, GLM-Z1-32B-0414 significantly improves mathematical abilities and the capability to solve complex tasks. During the training process, we also introduced general reinforcement learning based on pairwise ranking feedback, further enhancing the model's general capabilities.
17
+
18
+ **GLM-Z1-Rumination-32B-0414** is a deep reasoning model with **rumination capabilities** (benchmarked against OpenAI's Deep Research). Unlike typical deep thinking models, the rumination model employs longer periods of deep thought to solve more open-ended and complex problems (e.g., writing a comparative analysis of AI development in two cities and their future development plans). The rumination model integrates search tools during its deep thinking process to handle complex tasks and is trained by utilizing multiple rule-based rewards to guide and extend end-to-end reinforcement learning. Z1-Rumination shows significant improvements in research-style writing and complex retrieval tasks.
19
+
20
+ Finally, **GLM-Z1-9B-0414** is a surprise. We employed the aforementioned series of techniques to train a 9B small-sized model that maintains the open-source tradition. Despite its smaller scale, GLM-Z1-9B-0414 still exhibits excellent capabilities in mathematical reasoning and general tasks. Its overall performance is already at a leading level among open-source models of the same size. Especially in resource-constrained scenarios, this model achieves an excellent balance between efficiency and effectiveness, providing a powerful option for users seeking lightweight deployment.
21
+
22
+ ## Performance
23
+
24
+ <p align="center">
25
+ <img width="100%" src="https://raw.githubusercontent.com/THUDM/GLM-4/refs/heads/main/resources/Bench-Z1-32B.png">
26
+ </p>
27
+
28
+ <p align="center">
29
+ <img width="100%" src="https://raw.githubusercontent.com/THUDM/GLM-4/refs/heads/main/resources/Bench-Z1-9B.png">
30
+ </p>
31
+
32
+ ## Model Usage Guidelines
33
+
34
+ ### I. Sampling Parameters
35
+
36
+ | Parameter | Recommended Value | Description |
37
+ | ------------ | ----------------- | -------------------------------------------- |
38
+ | temperature | **0.6** | Balances creativity and stability |
39
+ | top_p | **0.95** | Cumulative probability threshold for sampling|
40
+ | top_k | **40** | Filters out rare tokens while maintaining diversity |
41
+ | max_new_tokens | **30000** | Leaves enough tokens for thinking |
42
+
43
+ ### II. Enforced Thinking
44
+
45
+ - Add \<think\>\n to the **first line**: Ensures the model thinks before responding
46
+ - When using `chat_template.jinja`, the prompt is automatically injected to enforce this behavior
47
+
48
+
49
+ ### III. Dialogue History Trimming
50
+
51
+ - Retain only the **final user-visible reply**.
52
+ Hidden thinking content should **not** be saved to history to reduce interference—this is already implemented in `chat_template.jinja`
53
+
54
+
55
+ ### IV. Handling Long Contexts (YaRN)
56
+
57
+ - When input length exceeds **8,192 tokens**, consider enabling YaRN (Rope Scaling)
58
+
59
+ - In supported frameworks, add the following snippet to `config.json`:
60
+
61
+ ```json
62
+ "rope_scaling": {
63
+ "type": "yarn",
64
+ "factor": 4.0,
65
+ "original_max_position_embeddings": 32768
66
+ }
67
+ ```
68
+
69
+ - **Static YaRN** applies uniformly to all text. It may slightly degrade performance on short texts, so enable as needed.
70
+
71
+ ## Inference Code
72
+
73
+ Make sure you are using `transformers>=4.51.3`.
74
+
75
+ ```python
76
+ from transformers import AutoModelForCausalLM, AutoTokenizer
77
+
78
+ MODEL_PATH = "THUDM/GLM-Z1-9B-0414"
79
+
80
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
81
+ model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto")
82
+
83
+ message = [{"role": "user", "content": "Let a, b be positive real numbers such that ab = a + b + 3. Determine the range of possible values for a + b."}]
84
+
85
+ inputs = tokenizer.apply_chat_template(
86
+ message,
87
+ return_tensors="pt",
88
+ add_generation_prompt=True,
89
+ return_dict=True,
90
+ ).to(model.device)
91
+
92
+ generate_kwargs = {
93
+ "input_ids": inputs["input_ids"],
94
+ "attention_mask": inputs["attention_mask"],
95
+ "max_new_tokens": 4096,
96
+ "do_sample": False,
97
+ }
98
+ out = model.generate(**generate_kwargs)
99
+ print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
100
+ ```
101
+
102
+ ## Citations
103
+
104
+ If you find our work useful, please consider citing the following paper.
105
+
106
+ ```
107
+ @misc{glm2024chatglm,
108
+ title={ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools},
109
+ author={Team GLM and Aohan Zeng and Bin Xu and Bowen Wang and Chenhui Zhang and Da Yin and Diego Rojas and Guanyu Feng and Hanlin Zhao and Hanyu Lai and Hao Yu and Hongning Wang and Jiadai Sun and Jiajie Zhang and Jiale Cheng and Jiayi Gui and Jie Tang and Jing Zhang and Juanzi Li and Lei Zhao and Lindong Wu and Lucen Zhong and Mingdao Liu and Minlie Huang and Peng Zhang and Qinkai Zheng and Rui Lu and Shuaiqi Duan and Shudan Zhang and Shulin Cao and Shuxun Yang and Weng Lam Tam and Wenyi Zhao and Xiao Liu and Xiao Xia and Xiaohan Zhang and Xiaotao Gu and Xin Lv and Xinghan Liu and Xinyi Liu and Xinyue Yang and Xixuan Song and Xunkai Zhang and Yifan An and Yifan Xu and Yilin Niu and Yuantao Yang and Yueyan Li and Yushi Bai and Yuxiao Dong and Zehan Qi and Zhaoyu Wang and Zhen Yang and Zhengxiao Du and Zhenyu Hou and Zihan Wang},
110
+ year={2024},
111
+ eprint={2406.12793},
112
+ archivePrefix={arXiv},
113
+ primaryClass={cs.CL}
114
+ }
115
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [gMASK]<sop>
2
+ {%- if tools -%}
3
+ <|system|>
4
+ 你是一个名为 ChatGLM 的人工智能助手。你是基于智谱 AI 公司训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。
5
+
6
+ # 可用工具
7
+ {%- for tool in tools %}
8
+ {%- set function = tool.function if tool.get("function") else tool %}
9
+
10
+ ## {{ function.name }}
11
+
12
+ {{ function | tojson(indent=4, ensure_ascii=False) }}
13
+ 在调用上述函数时,请使用 Json 格式表示调用的参数。
14
+ {%- endfor %}
15
+ {%- endif -%}
16
+
17
+ {%- for msg in messages %}
18
+ {%- if msg.role == 'system' %}
19
+ <|system|>
20
+ {{ msg.content }}
21
+ {%- endif %}
22
+ {%- endfor %}
23
+
24
+ {%- for message in messages if message.role != 'system' %}
25
+ {%- set role = message['role'] %}
26
+ {%- set content = message['content'] %}
27
+ {%- set visible = content.split('</think>')[-1].strip() %}
28
+ {%- set meta = message.get("metadata", "") %}
29
+
30
+ {%- if role == 'user' %}
31
+ <|user|>
32
+ {{ visible }}
33
+ {%- elif role == 'assistant' and not meta %}
34
+ <|assistant|>
35
+ {{ visible }}
36
+ {%- elif role == 'assistant' and meta %}
37
+ <|assistant|>{{ meta }}
38
+ {{ visible }}
39
+ {%- elif role == 'observation' %}
40
+ <|observation|>
41
+ {{ visible }}
42
+ {%- endif %}
43
+ {%- endfor %}
44
+ {% if add_generation_prompt %}<|assistant|>\n<think>{% endif %}
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Glm4ForCausalLM"
4
+ ],
5
+ "attention_bias": true,
6
+ "attention_dropout": 0.0,
7
+ "eos_token_id": [
8
+ 151329,
9
+ 151336,
10
+ 151338
11
+ ],
12
+ "head_dim": 128,
13
+ "hidden_act": "silu",
14
+ "hidden_size": 4096,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 13696,
17
+ "max_position_embeddings": 32768,
18
+ "model_type": "glm4",
19
+ "num_attention_heads": 32,
20
+ "num_hidden_layers": 40,
21
+ "num_key_value_heads": 2,
22
+ "pad_token_id": 151329,
23
+ "partial_rotary_factor": 0.5,
24
+ "rms_norm_eps": 1e-05,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "bfloat16",
28
+ "transformers_version": "4.52.0.dev0",
29
+ "use_cache": true,
30
+ "vocab_size": 151552
31
+ }
configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework":"Pytorch","task":"text-generation"}
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [
4
+ 151329,
5
+ 151336,
6
+ 151338
7
+ ],
8
+ "pad_token_id": 151329,
9
+ "transformers_version": "4.52.0.dev0"
10
+ }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e60b8e1d26b365252590099e996d66dcf75118bd1dd7e0ad209f90099f52321
3
+ size 4984283160
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:175f5395ee3a460cc88be729dda2628b3669fce5c7a081ffe347ef5437607e43
3
+ size 4895274600
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a628600a84d3654fc78b0fbe53dcac2a1e03c673f256b4518f635dfd25297a2b
3
+ size 4895274616
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8529f785e9b993f0e43dc24309b81fe3907c01f4ee0454ce8dce23fb354e96d7
3
+ size 4025786080
model.safetensors.index.json ADDED
@@ -0,0 +1,530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 18800558080
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
33
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
46
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
57
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
59
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
70
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
72
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
83
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
85
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
93
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
96
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
98
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
106
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
109
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.16.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
119
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
122
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
124
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.17.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.17.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
132
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
135
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
137
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
142
+ "model.layers.18.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
143
+ "model.layers.18.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
145
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
146
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
147
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
148
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
149
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
150
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
151
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
152
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
153
+ "model.layers.19.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
154
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
155
+ "model.layers.19.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
156
+ "model.layers.19.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
157
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
158
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
160
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
161
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
162
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
163
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
164
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
166
+ "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
167
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
168
+ "model.layers.2.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
169
+ "model.layers.2.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
170
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
171
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
172
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
173
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
174
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
175
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
176
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
177
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
178
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
179
+ "model.layers.20.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
180
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
181
+ "model.layers.20.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
182
+ "model.layers.20.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
183
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
184
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
185
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
186
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
187
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
188
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
189
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
190
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.21.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.21.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.21.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
197
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
198
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
199
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
200
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
201
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
202
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
203
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.22.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.22.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.22.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
213
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
215
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.23.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.23.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.23.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
226
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
228
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.24.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.24.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.24.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
239
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
241
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.25.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.25.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.25.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
249
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
252
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
254
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.26.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.26.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.26.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
262
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
263
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
264
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
265
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
266
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
267
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
268
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
270
+ "model.layers.27.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
271
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
272
+ "model.layers.27.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
273
+ "model.layers.27.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
275
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
277
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
278
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
279
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
280
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
282
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.28.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
284
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
285
+ "model.layers.28.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
286
+ "model.layers.28.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
287
+ "model.layers.28.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
288
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
289
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.28.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
291
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.28.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
293
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
295
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.29.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
297
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.29.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
299
+ "model.layers.29.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
301
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
303
+ "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
304
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
306
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
308
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.3.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.3.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
314
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
315
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
321
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
322
+ "model.layers.30.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
323
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.30.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
325
+ "model.layers.30.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
327
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
328
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
329
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
330
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
331
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
332
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
333
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
334
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
335
+ "model.layers.31.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
336
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
337
+ "model.layers.31.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
338
+ "model.layers.31.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
339
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
340
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
342
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
343
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
344
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
345
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
346
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
347
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
348
+ "model.layers.32.mlp.gate_up_proj.weight": "model-00003-of-00004.safetensors",
349
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
350
+ "model.layers.32.post_mlp_layernorm.weight": "model-00003-of-00004.safetensors",
351
+ "model.layers.32.post_self_attn_layernorm.weight": "model-00003-of-00004.safetensors",
352
+ "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
353
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
354
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
355
+ "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
356
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
357
+ "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
358
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
359
+ "model.layers.33.input_layernorm.weight": "model-00004-of-00004.safetensors",
360
+ "model.layers.33.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
361
+ "model.layers.33.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
362
+ "model.layers.33.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
363
+ "model.layers.33.post_mlp_layernorm.weight": "model-00004-of-00004.safetensors",
364
+ "model.layers.33.post_self_attn_layernorm.weight": "model-00004-of-00004.safetensors",
365
+ "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
366
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
367
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
368
+ "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
369
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
370
+ "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
371
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
372
+ "model.layers.34.input_layernorm.weight": "model-00004-of-00004.safetensors",
373
+ "model.layers.34.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
374
+ "model.layers.34.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
375
+ "model.layers.34.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
376
+ "model.layers.34.post_mlp_layernorm.weight": "model-00004-of-00004.safetensors",
377
+ "model.layers.34.post_self_attn_layernorm.weight": "model-00004-of-00004.safetensors",
378
+ "model.layers.34.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
379
+ "model.layers.34.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
380
+ "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
381
+ "model.layers.34.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
382
+ "model.layers.34.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
383
+ "model.layers.34.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
384
+ "model.layers.34.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
385
+ "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
386
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
387
+ "model.layers.35.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
388
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
389
+ "model.layers.35.post_mlp_layernorm.weight": "model-00004-of-00004.safetensors",
390
+ "model.layers.35.post_self_attn_layernorm.weight": "model-00004-of-00004.safetensors",
391
+ "model.layers.35.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
392
+ "model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
393
+ "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
394
+ "model.layers.35.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
395
+ "model.layers.35.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
396
+ "model.layers.35.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
397
+ "model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
398
+ "model.layers.36.input_layernorm.weight": "model-00004-of-00004.safetensors",
399
+ "model.layers.36.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
400
+ "model.layers.36.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
401
+ "model.layers.36.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
402
+ "model.layers.36.post_mlp_layernorm.weight": "model-00004-of-00004.safetensors",
403
+ "model.layers.36.post_self_attn_layernorm.weight": "model-00004-of-00004.safetensors",
404
+ "model.layers.36.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
405
+ "model.layers.36.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
406
+ "model.layers.36.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
407
+ "model.layers.36.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
408
+ "model.layers.36.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
409
+ "model.layers.36.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
410
+ "model.layers.36.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
411
+ "model.layers.37.input_layernorm.weight": "model-00004-of-00004.safetensors",
412
+ "model.layers.37.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
413
+ "model.layers.37.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
414
+ "model.layers.37.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
415
+ "model.layers.37.post_mlp_layernorm.weight": "model-00004-of-00004.safetensors",
416
+ "model.layers.37.post_self_attn_layernorm.weight": "model-00004-of-00004.safetensors",
417
+ "model.layers.37.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
418
+ "model.layers.37.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
419
+ "model.layers.37.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
420
+ "model.layers.37.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
421
+ "model.layers.37.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
422
+ "model.layers.37.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
423
+ "model.layers.37.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
424
+ "model.layers.38.input_layernorm.weight": "model-00004-of-00004.safetensors",
425
+ "model.layers.38.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
426
+ "model.layers.38.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
427
+ "model.layers.38.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
428
+ "model.layers.38.post_mlp_layernorm.weight": "model-00004-of-00004.safetensors",
429
+ "model.layers.38.post_self_attn_layernorm.weight": "model-00004-of-00004.safetensors",
430
+ "model.layers.38.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
431
+ "model.layers.38.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
432
+ "model.layers.38.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
433
+ "model.layers.38.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
434
+ "model.layers.38.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
435
+ "model.layers.38.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
436
+ "model.layers.38.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
437
+ "model.layers.39.input_layernorm.weight": "model-00004-of-00004.safetensors",
438
+ "model.layers.39.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
439
+ "model.layers.39.mlp.gate_up_proj.weight": "model-00004-of-00004.safetensors",
440
+ "model.layers.39.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
441
+ "model.layers.39.post_mlp_layernorm.weight": "model-00004-of-00004.safetensors",
442
+ "model.layers.39.post_self_attn_layernorm.weight": "model-00004-of-00004.safetensors",
443
+ "model.layers.39.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
444
+ "model.layers.39.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
445
+ "model.layers.39.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
446
+ "model.layers.39.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
447
+ "model.layers.39.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
448
+ "model.layers.39.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
449
+ "model.layers.39.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
450
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
451
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
452
+ "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
453
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
454
+ "model.layers.4.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
455
+ "model.layers.4.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
456
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
457
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
458
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
459
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
460
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
461
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
462
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
463
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
464
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
465
+ "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
466
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
467
+ "model.layers.5.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
468
+ "model.layers.5.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
469
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
470
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
471
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
472
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
473
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
474
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
475
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
476
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
477
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
478
+ "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
479
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
480
+ "model.layers.6.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
481
+ "model.layers.6.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
482
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
483
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
484
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
485
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
486
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
487
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
488
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
489
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
490
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
491
+ "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
492
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
493
+ "model.layers.7.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
494
+ "model.layers.7.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
495
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
496
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
497
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
498
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
499
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
500
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
501
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
502
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
503
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
504
+ "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00004.safetensors",
505
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
506
+ "model.layers.8.post_mlp_layernorm.weight": "model-00001-of-00004.safetensors",
507
+ "model.layers.8.post_self_attn_layernorm.weight": "model-00001-of-00004.safetensors",
508
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
509
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
510
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
511
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
512
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
513
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
514
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
515
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
516
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
517
+ "model.layers.9.mlp.gate_up_proj.weight": "model-00002-of-00004.safetensors",
518
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
519
+ "model.layers.9.post_mlp_layernorm.weight": "model-00002-of-00004.safetensors",
520
+ "model.layers.9.post_self_attn_layernorm.weight": "model-00002-of-00004.safetensors",
521
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
522
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
523
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
524
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
525
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
526
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
527
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
528
+ "model.norm.weight": "model-00004-of-00004.safetensors"
529
+ }
530
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "[MASK]",
5
+ "[gMASK]",
6
+ "[sMASK]",
7
+ "<sop>",
8
+ "<eop>",
9
+ "<|system|>",
10
+ "<|user|>",
11
+ "<|assistant|>",
12
+ "<|observation|>",
13
+ "<|begin_of_image|>",
14
+ "<|end_of_image|>",
15
+ "<|begin_of_video|>",
16
+ "<|end_of_video|>"
17
+ ],
18
+ "eos_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ }
32
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c
3
+ size 19966496
tokenizer_config.json ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "151329": {
4
+ "content": "<|endoftext|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "151330": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "151331": {
20
+ "content": "[gMASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "151332": {
28
+ "content": "[sMASK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "151333": {
36
+ "content": "<sop>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "151334": {
44
+ "content": "<eop>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "151335": {
52
+ "content": "<|system|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "151336": {
60
+ "content": "<|user|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "151337": {
68
+ "content": "<|assistant|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "151338": {
76
+ "content": "<|observation|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "151339": {
84
+ "content": "<|begin_of_image|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "151340": {
92
+ "content": "<|end_of_image|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "151341": {
100
+ "content": "<|begin_of_video|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "151342": {
108
+ "content": "<|end_of_video|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ }
115
+ },
116
+ "additional_special_tokens": [
117
+ "<|endoftext|>",
118
+ "[MASK]",
119
+ "[gMASK]",
120
+ "[sMASK]",
121
+ "<sop>",
122
+ "<eop>",
123
+ "<|system|>",
124
+ "<|user|>",
125
+ "<|assistant|>",
126
+ "<|observation|>",
127
+ "<|begin_of_image|>",
128
+ "<|end_of_image|>",
129
+ "<|begin_of_video|>",
130
+ "<|end_of_video|>"
131
+ ],
132
+ "chat_template": "[gMASK]<sop>{%- if tools -%}<|system|>你是一个名为 ChatGLM 的人工智能助手。你是基于智谱 AI 公司训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具\n\n{% for tool in tools %}{%- set function = tool.function if tool.get(\"function\") else tool %}\n\n## {{ function.name }}\n\n{{ function | tojson(indent=4, ensure_ascii=False) }}\n在调用上述函数时,请使用 Json 格式表示调用的参数。{%- endfor %}{%- endif -%}{%- for msg in messages %}{%- if msg.role == 'system' %}<|system|>\n{{ msg.content }}{%- endif %}{%- endfor %}{%- for message in messages if message.role != 'system' %}{%- set role = message['role'] %}{%- set content = message['content'] %}{%- set visible = content.split('</think>')[-1].strip() %}{%- set meta = message.get(\"metadata\", \"\") %}{%- if role == 'user' %}<|user|>\n{{ visible }}{%- elif role == 'assistant' and not meta %}<|assistant|>\n{{ visible }}{%- elif role == 'assistant' and meta %}<|assistant|>{{ meta }} \n{{ visible }}{%- elif role == 'observation' %}<|observation|>\n{{ visible }}{%- endif %}{%- endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
133
+ "clean_up_tokenization_spaces": false,
134
+ "do_lower_case": false,
135
+ "eos_token": "<|endoftext|>",
136
+ "extra_special_tokens": {},
137
+ "model_input_names": [
138
+ "input_ids",
139
+ "attention_mask"
140
+ ],
141
+ "model_max_length": 128000,
142
+ "pad_token": "<|endoftext|>",
143
+ "padding_side": "left",
144
+ "remove_space": false,
145
+ "tokenizer_class": "PreTrainedTokenizer"
146
+ }