QwQZh committed on
Commit
aad415c
·
1 Parent(s): fc269d2
1B_baseline/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_qwen3.Qwen3Config",
7
+ "AutoModel": "modeling_qwen3.Qwen3Model",
8
+ "AutoModelForCausalLM": "modeling_qwen3.Qwen3ForCausalLM"
9
+ },
10
+ "attention_dropout": 0.0,
11
+ "elementwise_attn_output_gate": false,
12
+ "headwise_attn_output_gate": false,
13
+ "bos_token_id": 151643,
14
+ "eos_token_id": 151643,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 2048,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 6144,
19
+ "max_position_embeddings": 32768,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen2",
22
+ "num_attention_heads": 16,
23
+ "num_hidden_layers": 28,
24
+ "num_key_value_heads": 8,
25
+ "rms_norm_eps": 1e-06,
26
+ "rope_theta": 1000000,
27
+ "sliding_window": null,
28
+ "tie_word_embeddings": true,
29
+ "torch_dtype": "bfloat16",
30
+ "transformers_version": "4.46.0",
31
+ "use_cache": true,
32
+ "use_qk_norm": true,
33
+ "qkv_bias": false,
34
+ "use_sliding_window": false,
35
+ "vocab_size": 152064
36
+ }
1B_baseline/configuration_qwen3.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+ """Qwen3 model configuration"""
23
+
24
+ from transformers.configuration_utils import PretrainedConfig
25
+ from transformers.modeling_rope_utils import rope_config_validation
26
+ from transformers.utils import logging
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
class Qwen3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a
    Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen3-8B-beta [Qwen/Qwen3-8B-beta](https://huggingface.co/Qwen/Qwen3-8B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen3 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, it falls back to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head. It is stored as given and is not derived from
            `hidden_size // num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. The value is only kept when `use_sliding_window=True`;
            otherwise the stored attribute is `None`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use
            full attention.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_qk_norm (`bool`, *optional*, defaults to `True`):
            Whether query and key in attention use norm
        elementwise_attn_output_gate (`bool`, *optional*, defaults to `False`):
            Flag stored on the config for an element-wise gate on the attention output.
        headwise_attn_output_gate (`bool`, *optional*, defaults to `False`):
            Flag stored on the config for a per-head gate on the attention output.
        qkv_bias (`bool`, *optional*):
            Bias flag exposed as `config.qkv_bias`. When left unset (`None`) it mirrors `attention_bias`, so the
            attribute always exists, including on a default-constructed configuration.

    ```python
    >>> from transformers import Qwen3Model, Qwen3Config

    >>> # Initializing a Qwen3 style configuration
    >>> configuration = Qwen3Config()

    >>> # Initializing a model from the Qwen3-8B style configuration
    >>> model = Qwen3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_bias=False,
        attention_dropout=0.0,
        use_qk_norm=True,
        elementwise_attn_output_gate=False,
        headwise_attn_output_gate=False,
        qkv_bias=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.use_sliding_window = use_sliding_window
        # The window size is only meaningful when sliding-window attention is enabled.
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        # `qkv_bias` previously only existed when a checkpoint's config.json supplied it through **kwargs;
        # always define it so consumers of `config.qkv_bias` also work with a default-constructed config.
        self.qkv_bias = attention_bias if qkv_bias is None else qkv_bias
        self.attention_dropout = attention_dropout
        self.use_qk_norm = use_qk_norm

        self.headwise_attn_output_gate = headwise_attn_output_gate
        self.elementwise_attn_output_gate = elementwise_attn_output_gate

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
218
+
1B_baseline/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": false,
4
+ "eos_token_id": 151643,
5
+ "max_new_tokens": 2048,
6
+ "transformers_version": "4.37.0"
7
+ }
1B_baseline/modeling_qwen3.py ADDED
@@ -0,0 +1,1539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen3 model."""
21
+
22
+ import math
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
34
+ from transformers.modeling_outputs import (
35
+ BaseModelOutputWithPast,
36
+ CausalLMOutputWithPast,
37
+ QuestionAnsweringModelOutput,
38
+ SequenceClassifierOutputWithPast,
39
+ TokenClassifierOutput,
40
+ )
41
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
42
+ from transformers.modeling_utils import PreTrainedModel
43
+ from transformers.utils import (
44
+ add_code_sample_docstrings,
45
+ add_start_docstrings,
46
+ add_start_docstrings_to_model_forward,
47
+ is_flash_attn_2_available,
48
+ is_flash_attn_greater_or_equal_2_10,
49
+ logging,
50
+ replace_return_docstrings,
51
+ )
52
+ from .configuration_qwen3 import Qwen3Config
53
+
54
+ if is_flash_attn_2_available():
55
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"
60
+ _CONFIG_FOR_DOC = "Qwen3Config"
61
+
62
+
63
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen3
64
class Qwen3RMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learned per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen3RMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: Size of the last (normalised) dimension; the scale starts at all ones.
            eps: Constant added to the mean square before taking the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise over the last dimension in float32, cast back, then apply the learned scale."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalised.to(original_dtype)

    def extra_repr(self):
        """Return the weight shape and epsilon shown in the module's repr."""
        shape = tuple(self.weight.shape)
        return f"{shape}, eps={self.variance_epsilon}"
82
+
83
+
84
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen3
85
class Qwen3RotaryEmbedding(nn.Module):
    """Computes the cos/sin tables used for rotary position embeddings (RoPE).

    The inverse frequencies and the attention scaling factor come from the initialiser registered in
    `ROPE_INIT_FUNCTIONS` under the selected `rope_type`.
    """

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[Qwen3Config] = None,
    ):
        """
        Args:
            dim, max_position_embeddings, base, scaling_factor, rope_type:
                Legacy parameterisation, only used when `config` is `None`; forwarded to the RoPE initialiser as
                keyword arguments.
            device: Device passed to the RoPE initialiser.
            config: Model config. When given, the rope type is read from `config.rope_scaling` (falling back to
                `"default"`) and the cached sequence length from `config.max_position_embeddings`.
        """
        super().__init__()
        # TODO (joao): remove the `if` below, only used for BC
        self.rope_kwargs = {}
        if config is None:
            logger.warning_once(
                "`Qwen3RotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.46"
            )
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Plain attribute (not a buffer): kept so dynamic RoPE can restore the initial frequencies.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            # `original_inv_freq` is a plain attribute, so `module.to(...)` does not move it together with the
            # registered buffer. Move it to the active device before restoring it, otherwise the restored
            # `inv_freq` could live on a different device than the inputs.
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`.

        `x` is only used for its device and dtype. `position_ids` is indexed as a 2-D tensor; the outputs have
        one row per position and a last dimension twice the length of `inv_freq`.
        """
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
170
+
171
+
172
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
173
def rotate_half(x):
    """Rotates half the hidden dims of the input.

    The last dimension is cut at `x.shape[-1] // 2`; the result is the negated second part followed by
    the first part.
    """
    midpoint = x.shape[-1] // 2
    leading, trailing = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-trailing, leading), dim=-1)
178
+
179
+
180
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
181
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
            For example, with `cos`/`sin` of shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when
            `q` and `k` are [batch_size, heads, seq_len, head_dim] and `unsqueeze_dim=2` when they are
            [batch_size, seq_len, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    # The same rotation is applied to queries and keys: t * cos + rotate_half(t) * sin.
    q_embed, k_embed = ((t * cos_b) + (rotate_half(t) * sin_b) for t in (q, k))
    return q_embed, k_embed
206
+
207
+
208
+ # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen3
209
class Qwen3MLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config):
        """Build the three projections from `config.hidden_size`, `config.intermediate_size` and
        the activation named by `config.hidden_act`."""
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        model_dim, inner_dim = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.up_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.down_proj = nn.Linear(inner_dim, model_dim, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
        """Apply the gated MLP to `hidden_state` and return the down-projected result."""
        gate = self.act_fn(self.gate_proj(hidden_state))
        gated = gate * self.up_proj(hidden_state)
        return self.down_proj(gated)
221
+
222
+
223
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
224
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head dimension.

    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim). With `n_rep == 1`
    the input tensor itself is returned.
    """
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis after the head axis, broadcast it, then fold it back into the head axis.
    tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
    return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
234
+
235
+
236
class Qwen3Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    This eager implementation supports grouped key/value heads, optional RMS normalisation of queries and keys
    (`config.use_qk_norm`) and an optional sigmoid gate on the attention output. The gate logits are produced by
    extra output features of `q_proj`: one per head for `headwise_attn_output_gate`, or `head_dim` per head for
    `elementwise_attn_output_gate`. If both flags are set, the head-wise variant takes precedence.
    """

    def __init__(self, config: Qwen3Config, layer_idx: Optional[int] = None):
        """
        Args:
            config: Model configuration the projection sizes and feature flags are read from.
            layer_idx: Index of this layer, passed to the key/value cache when it is updated.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.use_qk_norm = config.use_qk_norm
        self.headwise_attn_output_gate = config.headwise_attn_output_gate
        self.elementwise_attn_output_gate = config.elementwise_attn_output_gate

        # The config class exposes `attention_bias`, while checkpoints may carry `qkv_bias` in config.json.
        # Prefer `qkv_bias` when present and fall back to `attention_bias` so a default config still works.
        use_bias = getattr(config, "qkv_bias", None)
        if use_bias is None:
            use_bias = getattr(config, "attention_bias", False)

        # NOTE: `head_dim * num_heads` is intentionally not required to equal `hidden_size`, because
        # `head_dim` may be configured independently; the projections below are sized accordingly.
        q_out_features = self.num_heads * self.head_dim
        if self.headwise_attn_output_gate:
            # one extra gate logit per attention head
            q_out_features += self.num_heads
        elif self.elementwise_attn_output_gate:
            # one extra gate logit per attention-output element
            q_out_features *= 2
        self.q_proj = nn.Linear(self.hidden_size, q_out_features, bias=use_bias)

        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=use_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=use_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=use_bias)
        if self.use_qk_norm:
            self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
            self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)

        self.rotary_emb = Qwen3RotaryEmbedding(config=self.config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run eager self-attention.

        Args:
            hidden_states: Input of shape `(batch, seq_len, hidden)`.
            attention_mask: Optional additive mask; it is sliced to the key length and added to the scores.
            position_ids: Only used to compute rotary embeddings when `position_embeddings` is not given.
            past_key_value: Optional cache; when given it is updated with this layer's keys and values.
            output_attentions: Whether to return the attention probabilities.
            use_cache: Unused here; kept for interface compatibility.
            cache_position: Forwarded to the cache update.
            position_embeddings: Precomputed `(cos, sin)` rotary tables.

        Returns:
            `(attn_output, attn_weights or None, past_key_value)`.

        Raises:
            ValueError: If neither `position_embeddings` nor `position_ids` is provided, or if the attention
                output has an unexpected shape.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        if self.headwise_attn_output_gate:
            # Per KV group the projection is laid out as [queries of the group | one gate logit per head].
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states, [self.head_dim * self.num_key_value_groups, self.num_key_value_groups], dim=-1
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, 1)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        elif self.elementwise_attn_output_gate:
            # Per KV group the projection is laid out as [queries of the group | gate logits of the same size].
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states,
                [self.head_dim * self.num_key_value_groups, self.head_dim * self.num_key_value_groups],
                dim=-1,
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, self.head_dim)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        else:
            query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        if position_embeddings is None:
            # BC path: the argument defaults to None, so compute the tables locally instead of failing on
            # tuple-unpacking None.
            if position_ids is None:
                raise ValueError(
                    "Either `position_embeddings` or `position_ids` must be provided to apply rotary embeddings."
                )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # (batch, heads, seq, head_dim) -> (batch, seq, heads, head_dim), matching the gate layout
        attn_output = attn_output.transpose(1, 2).contiguous()

        if self.headwise_attn_output_gate or self.elementwise_attn_output_gate:
            attn_output = attn_output * torch.sigmoid(gate_score)

        attn_output = attn_output.reshape(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
366
+
367
+
368
class Qwen3FlashAttention2(Qwen3Attention):
    """
    Qwen3 flash attention module, following Qwen3 attention module. This module inherits from `Qwen3Attention`
    as the weights of the module stay untouched. Only the forward pass differs: it hands the projected
    query/key/value states, the attention mask and the position ids to `_flash_attention_forward` instead of
    computing the attention matrix itself. A sliding window is requested only when `config.use_sliding_window`
    is set, `config.sliding_window` is not `None` and `layer_idx >= config.max_window_layers`.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ):
        """
        Run flash attention over `hidden_states`, optionally gating the attention output.

        Args:
            hidden_states: three-dimensional input; its first two sizes are used as batch size and query length.
            attention_mask: forwarded unchanged to `_flash_attention_forward`.
            position_ids: forwarded unchanged to `_flash_attention_forward`.
            past_key_value: optional cache; when given it is updated with this layer's key/value states and the
                updated states are used for attention.
            output_attentions: accepted for interface compatibility only. Attention probabilities are never
                computed on this path, so the second return value is always `None`.
            use_cache: not read by this method.
            cache_position: passed along to the cache update.
            position_embeddings: `(cos, sin)` pair consumed by `apply_rotary_pos_emb`; it is unpacked
                unconditionally, so it must be provided.

        Returns:
            `(attn_output, None, past_key_value)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # With an output gate enabled, the q projection output is split per key/value head into the query
        # channels and the gate channels: one gate scalar per query head (headwise) or one gate value per
        # query channel (elementwise).
        if self.headwise_attn_output_gate:
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states,
                [self.head_dim * self.num_key_value_groups, self.num_key_value_groups],
                dim=-1,
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, 1)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        elif self.elementwise_attn_output_gate:
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states,
                [self.head_dim * self.num_key_value_groups, self.head_dim * self.num_key_value_groups],
                dim=-1,
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, self.head_dim)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        else:
            query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Unlike the eager path, key/value heads are not repeated here; they are handed to the flash
        # attention helper with their own head count.
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the expected shape for Flash Attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window
        else:
            sliding_window = None

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=sliding_window,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        if self.headwise_attn_output_gate or self.elementwise_attn_output_gate:
            attn_output = attn_output * torch.sigmoid(gate_score)

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        # Attention probabilities are never materialised on this path. Previously `attn_weights` was only
        # assigned when `output_attentions` was False, so `output_attentions=True` failed with an unbound
        # local variable at the return statement. Always report `None` instead.
        attn_weights = None

        return attn_output, attn_weights, past_key_value
492
+
493
+
494
+
495
class Qwen3SdpaAttention(Qwen3Attention):
    """
    Qwen3 attention module built on `torch.nn.functional.scaled_dot_product_attention`. It inherits from
    `Qwen3Attention` and reuses its projections unchanged; only the forward pass is adapted to the SDPA API.
    When `output_attentions=True` is requested, the call is delegated to the parent implementation.
    """

    # Adapted from Qwen3Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Compute attention with SDPA and return `(attn_output, None, past_key_value)`.

        `position_embeddings` is the `(cos, sin)` pair applied to queries and keys; it is unpacked
        unconditionally on the SDPA path. `position_ids` and `use_cache` are only forwarded to the parent
        implementation in the `output_attentions=True` fallback.
        """
        if output_attentions:
            # SDPA cannot return attention probabilities, so hand the whole call to the parent class.
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "Qwen3Model is using Qwen3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        batch_size, seq_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # With a gate enabled, the q projection carries extra gate channels per key/value head:
        # one scalar per query head (headwise, which takes precedence) or one value per query channel
        # (elementwise).
        gated = self.headwise_attn_output_gate or self.elementwise_attn_output_gate
        if gated:
            gate_width = 1 if self.headwise_attn_output_gate else self.head_dim
            groups = self.num_key_value_groups
            per_kv_head = query_states.view(batch_size, seq_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                per_kv_head, [self.head_dim * groups, gate_width * groups], dim=-1
            )
            gate_score = gate_score.reshape(batch_size, seq_len, -1, gate_width)
            query_states = query_states.reshape(batch_size, seq_len, -1, self.head_dim)
        else:
            query_states = query_states.view(batch_size, seq_len, -1, self.head_dim)
        query_states = query_states.transpose(1, 2)
        key_states = key_states.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # Specific to RoPE models
            rope_cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, rope_cache_kwargs
            )

        # Expand key/value heads so that their head count matches the query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Whatever length the mask has, keep only the columns that correspond to existing keys.
        causal_mask = attention_mask
        if attention_mask is not None:
            kv_len = key_states.shape[-2]
            causal_mask = attention_mask[:, :, :, :kv_len]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # `is_causal` is resolved in a separate statement rather than inline in the SDPA call, to support both
        # torch.compile's dynamic shapes and full graph options. The `seq_len > 1` test matches
        # AttentionMaskConverter.to_causal_4d, which does not create a causal mask for a single query token.
        if causal_mask is None and seq_len > 1:
            is_causal = True
        else:
            is_causal = False

        dropout_p = self.attention_dropout if self.training else 0.0
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=dropout_p,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()

        if gated:
            attn_output = attn_output * torch.sigmoid(gate_score)

        attn_output = attn_output.view(batch_size, seq_len, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
602
+
603
# Lookup table from the `config._attn_implementation` string to the attention module class.
QWEN3_ATTENTION_CLASSES = dict(
    eager=Qwen3Attention,
    flash_attention_2=Qwen3FlashAttention2,
    sdpa=Qwen3SdpaAttention,
)
608
+
609
+
610
class Qwen3DecoderLayer(nn.Module):
    """
    One decoder block: RMSNorm -> self-attention -> residual add, followed by
    RMSNorm -> MLP -> residual add.
    """

    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        attn_impl = config._attn_implementation
        if config.sliding_window and attn_impl != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{attn_impl}`; "
                "unexpected results may be encountered."
            )
        attention_cls = QWEN3_ATTENTION_CLASSES[attn_impl]
        self.self_attn = attention_cls(config, layer_idx)

        self.mlp = Qwen3MLP(config)

        norm_eps = config.rms_norm_eps
        self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Apply the attention sub-layer and the MLP sub-layer, each wrapped in a residual connection.

        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): mask forwarded as-is to the attention module.
            position_ids (`torch.LongTensor`, *optional*): forwarded as-is to the attention module.
            past_key_value (*optional*): cached past key and value projection states, forwarded to the
                attention module.
            output_attentions (`bool`, *optional*):
                When truthy, the attention weights returned by the attention module are appended to the output.
            use_cache (`bool`, *optional*):
                When truthy, the key/value cache returned by the attention module is appended to the output.
            cache_position (`torch.LongTensor`, *optional*): forwarded as-is to the attention module.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Cosine and sine positional embeddings, forwarded as-is to the attention module.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model

        Returns:
            A tuple whose first element is the new hidden states, optionally followed by the attention weights
            and then the key/value cache, depending on `output_attentions` and `use_cache`.
        """
        # Self Attention (pre-norm, then residual add)
        attn_out, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = hidden_states + attn_out

        # Fully Connected (pre-norm, then residual add)
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_out

        results = [hidden_states]
        if output_attentions:
            results.append(self_attn_weights)
        if use_cache:
            results.append(present_key_value)

        return tuple(results)
692
+
693
+
694
+ QWEN3_START_DOCSTRING = r"""
695
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
696
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
697
+ etc.)
698
+
699
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
700
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
701
+ and behavior.
702
+
703
+ Parameters:
704
+ config ([`Qwen3Config`]):
705
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
706
+ load the weights associated with the model, only the configuration. Check out the
707
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
708
+ """
709
+
710
+
711
@add_start_docstrings(
    "The bare Qwen3 Model outputting raw hidden-states without any specific head on top.",
    QWEN3_START_DOCSTRING,
)
class Qwen3PreTrainedModel(PreTrainedModel):
    # Class-level settings read by the `PreTrainedModel` machinery.
    config_class = Qwen3Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        """
        Initialise `module` in place: weights of `nn.Linear` and `nn.Embedding` are drawn from a normal
        distribution with mean 0 and std `config.initializer_range`; linear biases and the embedding padding
        row (when present) are zeroed. Any other module type is left untouched.
        """
        std = self.config.initializer_range

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            bias = module.bias
            if bias is not None:
                bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()
737
+
738
+
739
+ QWEN3_INPUTS_DOCSTRING = r"""
740
+ Args:
741
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
742
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
743
+ it.
744
+
745
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
746
+ [`PreTrainedTokenizer.__call__`] for details.
747
+
748
+ [What are input IDs?](../glossary#input-ids)
749
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
750
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
751
+
752
+ - 1 for tokens that are **not masked**,
753
+ - 0 for tokens that are **masked**.
754
+
755
+ [What are attention masks?](../glossary#attention-mask)
756
+
757
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
758
+ [`PreTrainedTokenizer.__call__`] for details.
759
+
760
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
761
+ `past_key_values`).
762
+
763
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
764
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
765
+ information on the default strategy.
766
+
767
+ - 1 indicates the head is **not masked**,
768
+ - 0 indicates the head is **masked**.
769
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
770
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
771
+ config.n_positions - 1]`.
772
+
773
+ [What are position IDs?](../glossary#position-ids)
774
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
775
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
776
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
777
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
778
+
779
+ Two formats are allowed:
780
+ - a [`~cache_utils.Cache`] instance, see our
781
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
782
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
783
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
784
+ cache format.
785
+
786
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
787
+ legacy cache format will be returned.
788
+
789
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
790
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
791
+ of shape `(batch_size, sequence_length)`.
792
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
793
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
794
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
795
+ model's internal embedding lookup matrix.
796
+ use_cache (`bool`, *optional*):
797
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
798
+ `past_key_values`).
799
+ output_attentions (`bool`, *optional*):
800
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
801
+ tensors for more detail.
802
+ output_hidden_states (`bool`, *optional*):
803
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
804
+ more detail.
805
+ return_dict (`bool`, *optional*):
806
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
807
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
808
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
809
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
810
+ the complete sequence length.
811
+ """
812
+
813
+
814
+ @add_start_docstrings(
815
+ "The bare Qwen3 Model outputting raw hidden-states without any specific head on top.",
816
+ QWEN3_START_DOCSTRING,
817
+ )
818
+ class Qwen3Model(Qwen3PreTrainedModel):
819
+ """
820
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen3DecoderLayer`]
821
+
822
+ Args:
823
+ config: Qwen3Config
824
+ """
825
+
826
def __init__(self, config: Qwen3Config):
    """Build the token embedding, the decoder stack, the final norm and the rotary embedding from `config`."""
    super().__init__(config)
    self.padding_idx = config.pad_token_id
    self.vocab_size = config.vocab_size

    self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)

    # One decoder layer per hidden layer; each layer is told its own index.
    decoder_layers = []
    for layer_idx in range(config.num_hidden_layers):
        decoder_layers.append(Qwen3DecoderLayer(config, layer_idx))
    self.layers = nn.ModuleList(decoder_layers)

    self._attn_implementation = config._attn_implementation
    self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    self.rotary_emb = Qwen3RotaryEmbedding(config=config)

    self.gradient_checkpointing = False
    # Initialize weights and apply final processing
    self.post_init()
842
+
843
def get_input_embeddings(self):
    """Return the module stored in `self.embed_tokens`."""
    token_embedding = self.embed_tokens
    return token_embedding
845
+
846
def set_input_embeddings(self, value):
    """Replace the module stored in `self.embed_tokens` with `value`."""
    new_embedding = value
    self.embed_tokens = new_embedding
848
+
849
@add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    # Every flag the caller left unset falls back to the model config.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if use_cache is None:
        use_cache = self.config.use_cache
    if return_dict is None:
        return_dict = self.config.use_return_dict

    # Exactly one of `input_ids` / `inputs_embeds` must be given: both or neither is an error.
    if (input_ids is None) == (inputs_embeds is None):
        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

    if self.gradient_checkpointing and self.training and use_cache:
        logger.warning_once(
            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
        )
        use_cache = False

    # kept for BC (non `Cache` `past_key_values` inputs): wrap them in a `DynamicCache` for the forward
    # pass and convert back to the legacy format before returning.
    return_legacy_cache = bool(use_cache) and not isinstance(past_key_values, Cache)
    if return_legacy_cache:
        if past_key_values is None:
            past_key_values = DynamicCache()
        else:
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
            )

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    if cache_position is None:
        # Positions continue from however many tokens the cache already holds.
        if past_key_values is not None:
            past_seen_tokens = past_key_values.get_seq_length()
        else:
            past_seen_tokens = 0
        cache_position = torch.arange(
            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
        )
    if position_ids is None:
        position_ids = cache_position.unsqueeze(0)

    causal_mask = self._update_causal_mask(
        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
    )

    hidden_states = inputs_embeds

    # create position embeddings to be shared across the decoder layers
    position_embeddings = self.rotary_emb(hidden_states, position_ids)

    # Optional per-layer collections; they stay `None` when not requested.
    hidden_trace = [] if output_hidden_states else None
    attn_trace = [] if output_attentions else None
    next_decoder_cache = None
    # A layer returns (hidden, [attn_weights], [cache]); the cache slot shifts when weights are present.
    cache_slot = 2 if output_attentions else 1

    for decoder_layer in self.layers:
        if output_hidden_states:
            hidden_trace.append(hidden_states)

        if self.gradient_checkpointing and self.training:
            layer_outputs = self._gradient_checkpointing_func(
                decoder_layer.__call__,
                hidden_states,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                position_embeddings,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            next_decoder_cache = layer_outputs[cache_slot]

        if output_attentions:
            attn_trace.append(layer_outputs[1])

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        hidden_trace.append(hidden_states)

    all_hidden_states = None if hidden_trace is None else tuple(hidden_trace)
    all_self_attns = None if attn_trace is None else tuple(attn_trace)

    next_cache = next_decoder_cache if use_cache else None
    if return_legacy_cache:
        next_cache = next_cache.to_legacy_cache()

    if return_dict:
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    # Tuple output: drop every entry that was not produced.
    candidates = [hidden_states, next_cache, all_hidden_states, all_self_attns]
    return tuple(item for item in candidates if item is not None)
974
+
975
# Adapted from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
def _update_causal_mask(
    self,
    attention_mask: torch.Tensor,
    input_tensor: torch.Tensor,
    cache_position: torch.Tensor,
    past_key_values: Cache,
    output_attentions: bool,
):
    """
    Produce the mask handed to the decoder layers for the configured attention implementation.

    For `flash_attention_2` the incoming mask is returned unchanged when it contains a `0.0` entry, otherwise
    `None`. For the other implementations a 4D causal mask is built via
    `_prepare_4d_causal_attention_mask_with_cache_position`, unless the SDPA shortcut decides the mask can be
    ignored, in which case `None` is returned.
    """
    attn_impl = self.config._attn_implementation

    if attn_impl == "flash_attention_2":
        has_masked_entry = attention_mask is not None and 0.0 in attention_mask
        return attention_mask if has_masked_entry else None

    # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
    # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
    # to infer the attention mask.
    if past_key_values is not None:
        past_seen_tokens = past_key_values.get_seq_length()
    else:
        past_seen_tokens = 0
    fixed_size_cache = isinstance(past_key_values, (StaticCache, SlidingWindowCache))

    # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
    sdpa_shortcut_allowed = attn_impl == "sdpa" and not fixed_size_cache and not output_attentions
    if sdpa_shortcut_allowed and AttentionMaskConverter._ignore_causal_mask_sdpa(
        attention_mask,
        inputs_embeds=input_tensor,
        past_key_values_length=past_seen_tokens,
        sliding_window=self.config.sliding_window,
        is_training=self.training,
    ):
        return None

    dtype = input_tensor.dtype
    device = input_tensor.device
    min_dtype = torch.finfo(dtype).min
    sequence_length = input_tensor.shape[1]

    if fixed_size_cache:
        # SlidingWindowCache or StaticCache: the cache object reports the mask width.
        target_length = past_key_values.get_max_cache_shape()
    elif isinstance(attention_mask, torch.Tensor):
        # DynamicCache or no cache, with a mask tensor: use the mask's own width.
        target_length = attention_mask.shape[-1]
    else:
        # DynamicCache or no cache, without a mask tensor.
        target_length = past_seen_tokens + sequence_length + 1

    # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
    causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask,
        sequence_length=sequence_length,
        target_length=target_length,
        dtype=dtype,
        device=device,
        cache_position=cache_position,
        batch_size=input_tensor.shape[0],
        config=self.config,
        past_key_values=past_key_values,
    )

    needs_unmask = (
        attn_impl == "sdpa"
        and attention_mask is not None
        and attention_mask.device.type == "cuda"
        and not output_attentions
    )
    if needs_unmask:
        # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        # Details: https://github.com/pytorch/pytorch/issues/110213
        causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

    return causal_mask
1050
+
1051
+ @staticmethod
1052
+ # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Qwen3
1053
+ def _prepare_4d_causal_attention_mask_with_cache_position(
1054
+ attention_mask: torch.Tensor,
1055
+ sequence_length: int,
1056
+ target_length: int,
1057
+ dtype: torch.dtype,
1058
+ device: torch.device,
1059
+ cache_position: torch.Tensor,
1060
+ batch_size: int,
1061
+ config: Qwen3Config,
1062
+ past_key_values: Cache,
1063
+ ):
1064
+ """
1065
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1066
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
1067
+
1068
+ Args:
1069
+ attention_mask (`torch.Tensor`):
1070
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
1071
+ sequence_length (`int`):
1072
+ The sequence length being processed.
1073
+ target_length (`int`):
1074
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
1075
+ dtype (`torch.dtype`):
1076
+ The dtype to use for the 4D attention mask.
1077
+ device (`torch.device`):
1078
+ The device to place the 4D attention mask on.
1079
+ cache_position (`torch.Tensor`):
1080
+ Indices depicting the position of the input sequence tokens in the sequence.
1081
+ batch_size (`int`):
1082
+ Batch size.
1083
+ config (`Qwen3Config`):
1084
+ The model's configuration class
1085
+ past_key_values (`Cache`):
1086
+ The cache class that is being used currently to generate
1087
+ """
1088
+ if attention_mask is not None and attention_mask.dim() == 4:
1089
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
1090
+ causal_mask = attention_mask
1091
+ else:
1092
+ min_dtype = torch.finfo(dtype).min
1093
+ causal_mask = torch.full(
1094
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
1095
+ )
1096
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1097
+ if config.sliding_window is not None:
1098
+ # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
1099
+ # the check is needed to verify whether the current checkpoint was trained with a sliding window or not
1100
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
1101
+ sliding_attend_mask = torch.arange(target_length, device=device) <= (
1102
+ cache_position.reshape(-1, 1) - config.sliding_window
1103
+ )
1104
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
1105
+ causal_mask *= diagonal_attend_mask
1106
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
1107
+ if attention_mask is not None:
1108
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1109
+ if attention_mask.shape[-1] > target_length:
1110
+ attention_mask = attention_mask[:, :target_length]
1111
+ mask_length = attention_mask.shape[-1]
1112
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
1113
+ padding_mask = padding_mask == 0
1114
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1115
+ padding_mask, min_dtype
1116
+ )
1117
+ return causal_mask
1118
+
1119
+
1120
+ class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin):
1121
+ _tied_weights_keys = ["lm_head.weight"]
1122
+ _tp_plan = {"lm_head": "colwise_rep"}
1123
+
1124
+ def __init__(self, config):
1125
+ super().__init__(config)
1126
+ self.model = Qwen3Model(config)
1127
+ self.vocab_size = config.vocab_size
1128
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1129
+
1130
+ # Initialize weights and apply final processing
1131
+ self.post_init()
1132
+
1133
+ def get_input_embeddings(self):
1134
+ return self.model.embed_tokens
1135
+
1136
+ def set_input_embeddings(self, value):
1137
+ self.model.embed_tokens = value
1138
+
1139
+ def get_output_embeddings(self):
1140
+ return self.lm_head
1141
+
1142
+ def set_output_embeddings(self, new_embeddings):
1143
+ self.lm_head = new_embeddings
1144
+
1145
+ def set_decoder(self, decoder):
1146
+ self.model = decoder
1147
+
1148
+ def get_decoder(self):
1149
+ return self.model
1150
+
1151
+ @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
1152
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1153
+ def forward(
1154
+ self,
1155
+ input_ids: torch.LongTensor = None,
1156
+ attention_mask: Optional[torch.Tensor] = None,
1157
+ position_ids: Optional[torch.LongTensor] = None,
1158
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1159
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1160
+ labels: Optional[torch.LongTensor] = None,
1161
+ use_cache: Optional[bool] = None,
1162
+ output_attentions: Optional[bool] = None,
1163
+ output_hidden_states: Optional[bool] = None,
1164
+ return_dict: Optional[bool] = None,
1165
+ cache_position: Optional[torch.LongTensor] = None,
1166
+ num_logits_to_keep: int = 0,
1167
+ **loss_kwargs,
1168
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1169
+ r"""
1170
+ Args:
1171
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1172
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1173
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1174
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1175
+
1176
+ num_logits_to_keep (`int`, *optional*):
1177
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1178
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
1179
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
1180
+
1181
+ Returns:
1182
+
1183
+ Example:
1184
+
1185
+ ```python
1186
+ >>> from transformers import AutoTokenizer, Qwen3ForCausalLM
1187
+
1188
+ >>> model = Qwen3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1189
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1190
+
1191
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1192
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1193
+
1194
+ >>> # Generate
1195
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1196
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1197
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1198
+ ```"""
1199
+
1200
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1201
+ output_hidden_states = (
1202
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1203
+ )
1204
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1205
+
1206
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
1207
+ outputs = self.model(
1208
+ input_ids=input_ids,
1209
+ attention_mask=attention_mask,
1210
+ position_ids=position_ids,
1211
+ past_key_values=past_key_values,
1212
+ inputs_embeds=inputs_embeds,
1213
+ use_cache=use_cache,
1214
+ output_attentions=output_attentions,
1215
+ output_hidden_states=output_hidden_states,
1216
+ return_dict=return_dict,
1217
+ cache_position=cache_position,
1218
+ )
1219
+
1220
+ hidden_states = outputs[0]
1221
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1222
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
1223
+
1224
+ loss = None
1225
+ if labels is not None:
1226
+ loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
1227
+
1228
+ if not return_dict:
1229
+ output = (logits,) + outputs[1:]
1230
+ return (loss,) + output if loss is not None else output
1231
+
1232
+ return CausalLMOutputWithPast(
1233
+ loss=loss,
1234
+ logits=logits,
1235
+ past_key_values=outputs.past_key_values,
1236
+ hidden_states=outputs.hidden_states,
1237
+ attentions=outputs.attentions,
1238
+ )
1239
+
1240
+
1241
+ @add_start_docstrings(
1242
+ """
1243
+ The Qwen3 Model transformer with a sequence classification head on top (linear layer).
1244
+
1245
+ [`Qwen3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1246
+ (e.g. GPT-2) do.
1247
+
1248
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1249
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1250
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1251
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1252
+ each row of the batch).
1253
+ """,
1254
+ QWEN3_START_DOCSTRING,
1255
+ )
1256
+ class Qwen3ForSequenceClassification(Qwen3PreTrainedModel):
1257
+ def __init__(self, config):
1258
+ super().__init__(config)
1259
+ self.num_labels = config.num_labels
1260
+ self.model = Qwen3Model(config)
1261
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1262
+
1263
+ # Initialize weights and apply final processing
1264
+ self.post_init()
1265
+
1266
+ def get_input_embeddings(self):
1267
+ return self.model.embed_tokens
1268
+
1269
+ def set_input_embeddings(self, value):
1270
+ self.model.embed_tokens = value
1271
+
1272
+ @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
1273
+ def forward(
1274
+ self,
1275
+ input_ids: torch.LongTensor = None,
1276
+ attention_mask: Optional[torch.Tensor] = None,
1277
+ position_ids: Optional[torch.LongTensor] = None,
1278
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1279
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1280
+ labels: Optional[torch.LongTensor] = None,
1281
+ use_cache: Optional[bool] = None,
1282
+ output_attentions: Optional[bool] = None,
1283
+ output_hidden_states: Optional[bool] = None,
1284
+ return_dict: Optional[bool] = None,
1285
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1286
+ r"""
1287
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1288
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1289
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
1290
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1291
+ """
1292
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1293
+
1294
+ transformer_outputs = self.model(
1295
+ input_ids,
1296
+ attention_mask=attention_mask,
1297
+ position_ids=position_ids,
1298
+ past_key_values=past_key_values,
1299
+ inputs_embeds=inputs_embeds,
1300
+ use_cache=use_cache,
1301
+ output_attentions=output_attentions,
1302
+ output_hidden_states=output_hidden_states,
1303
+ return_dict=return_dict,
1304
+ )
1305
+ hidden_states = transformer_outputs[0]
1306
+ logits = self.score(hidden_states)
1307
+
1308
+ if input_ids is not None:
1309
+ batch_size = input_ids.shape[0]
1310
+ else:
1311
+ batch_size = inputs_embeds.shape[0]
1312
+
1313
+ if self.config.pad_token_id is None and batch_size != 1:
1314
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1315
+ if self.config.pad_token_id is None:
1316
+ sequence_lengths = -1
1317
+ else:
1318
+ if input_ids is not None:
1319
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1320
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1321
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1322
+ sequence_lengths = sequence_lengths.to(logits.device)
1323
+ else:
1324
+ sequence_lengths = -1
1325
+
1326
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1327
+
1328
+ loss = None
1329
+ if labels is not None:
1330
+ labels = labels.to(logits.device)
1331
+ if self.config.problem_type is None:
1332
+ if self.num_labels == 1:
1333
+ self.config.problem_type = "regression"
1334
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1335
+ self.config.problem_type = "single_label_classification"
1336
+ else:
1337
+ self.config.problem_type = "multi_label_classification"
1338
+
1339
+ if self.config.problem_type == "regression":
1340
+ loss_fct = MSELoss()
1341
+ if self.num_labels == 1:
1342
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1343
+ else:
1344
+ loss = loss_fct(pooled_logits, labels)
1345
+ elif self.config.problem_type == "single_label_classification":
1346
+ loss_fct = CrossEntropyLoss()
1347
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1348
+ elif self.config.problem_type == "multi_label_classification":
1349
+ loss_fct = BCEWithLogitsLoss()
1350
+ loss = loss_fct(pooled_logits, labels)
1351
+ if not return_dict:
1352
+ output = (pooled_logits,) + transformer_outputs[1:]
1353
+ return ((loss,) + output) if loss is not None else output
1354
+
1355
+ return SequenceClassifierOutputWithPast(
1356
+ loss=loss,
1357
+ logits=pooled_logits,
1358
+ past_key_values=transformer_outputs.past_key_values,
1359
+ hidden_states=transformer_outputs.hidden_states,
1360
+ attentions=transformer_outputs.attentions,
1361
+ )
1362
+
1363
+
1364
+ @add_start_docstrings(
1365
+ """
1366
+ The Qwen3 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1367
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
1368
+ """,
1369
+ QWEN3_START_DOCSTRING,
1370
+ )
1371
+ # Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen3, LLAMA->QWEN3
1372
+ class Qwen3ForTokenClassification(Qwen3PreTrainedModel):
1373
+ def __init__(self, config):
1374
+ super().__init__(config)
1375
+ self.num_labels = config.num_labels
1376
+ self.model = Qwen3Model(config)
1377
+ if getattr(config, "classifier_dropout", None) is not None:
1378
+ classifier_dropout = config.classifier_dropout
1379
+ elif getattr(config, "hidden_dropout", None) is not None:
1380
+ classifier_dropout = config.hidden_dropout
1381
+ else:
1382
+ classifier_dropout = 0.1
1383
+ self.dropout = nn.Dropout(classifier_dropout)
1384
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
1385
+
1386
+ # Initialize weights and apply final processing
1387
+ self.post_init()
1388
+
1389
+ def get_input_embeddings(self):
1390
+ return self.model.embed_tokens
1391
+
1392
+ def set_input_embeddings(self, value):
1393
+ self.model.embed_tokens = value
1394
+
1395
+ @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
1396
+ @add_code_sample_docstrings(
1397
+ checkpoint=_CHECKPOINT_FOR_DOC,
1398
+ output_type=TokenClassifierOutput,
1399
+ config_class=_CONFIG_FOR_DOC,
1400
+ )
1401
+ def forward(
1402
+ self,
1403
+ input_ids: Optional[torch.LongTensor] = None,
1404
+ attention_mask: Optional[torch.Tensor] = None,
1405
+ position_ids: Optional[torch.LongTensor] = None,
1406
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1407
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1408
+ labels: Optional[torch.LongTensor] = None,
1409
+ use_cache: Optional[bool] = None,
1410
+ output_attentions: Optional[bool] = None,
1411
+ output_hidden_states: Optional[bool] = None,
1412
+ return_dict: Optional[bool] = None,
1413
+ ) -> Union[Tuple, TokenClassifierOutput]:
1414
+ r"""
1415
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1416
+ Labels for computing the token classification loss, one label per token. Indices should be in `[0, ...,
1417
+ config.num_labels - 1]`. The loss is computed by `self.loss_function` over the per-token logits produced by
1418
+ the `score` head.
1419
+ """
1420
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1421
+
1422
+ outputs = self.model(
1423
+ input_ids,
1424
+ attention_mask=attention_mask,
1425
+ position_ids=position_ids,
1426
+ past_key_values=past_key_values,
1427
+ inputs_embeds=inputs_embeds,
1428
+ use_cache=use_cache,
1429
+ output_attentions=output_attentions,
1430
+ output_hidden_states=output_hidden_states,
1431
+ return_dict=return_dict,
1432
+ )
1433
+ sequence_output = outputs[0]
1434
+ sequence_output = self.dropout(sequence_output)
1435
+ logits = self.score(sequence_output)
1436
+
1437
+ loss = None
1438
+ if labels is not None:
1439
+ loss = self.loss_function(logits, labels, self.config)
1440
+
1441
+ if not return_dict:
1442
+ output = (logits,) + outputs[2:]
1443
+ return ((loss,) + output) if loss is not None else output
1444
+
1445
+ return TokenClassifierOutput(
1446
+ loss=loss,
1447
+ logits=logits,
1448
+ hidden_states=outputs.hidden_states,
1449
+ attentions=outputs.attentions,
1450
+ )
1451
+
1452
+
1453
+ @add_start_docstrings(
1454
+ """
1455
+ The Qwen3 Model transformer with a span classification head on top for extractive question-answering tasks like
1456
+ SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1457
+ """,
1458
+ QWEN3_START_DOCSTRING,
1459
+ )
1460
+ # Copied from transformers.models.mistral.modeling_mistral.MistralForQuestionAnswering with Mistral->Qwen3, MISTRAL->QWEN3
1461
+ class Qwen3ForQuestionAnswering(Qwen3PreTrainedModel):
1462
+ base_model_prefix = "model"
1463
+
1464
+ # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Qwen3
1465
+ def __init__(self, config):
1466
+ super().__init__(config)
1467
+ self.model = Qwen3Model(config)
1468
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
1469
+
1470
+ # Initialize weights and apply final processing
1471
+ self.post_init()
1472
+
1473
+ def get_input_embeddings(self):
1474
+ return self.model.embed_tokens
1475
+
1476
+ def set_input_embeddings(self, value):
1477
+ self.model.embed_tokens = value
1478
+
1479
+ @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
1480
+ def forward(
1481
+ self,
1482
+ input_ids: Optional[torch.LongTensor] = None,
1483
+ attention_mask: Optional[torch.FloatTensor] = None,
1484
+ position_ids: Optional[torch.LongTensor] = None,
1485
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1486
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1487
+ start_positions: Optional[torch.LongTensor] = None,
1488
+ end_positions: Optional[torch.LongTensor] = None,
1489
+ output_attentions: Optional[bool] = None,
1490
+ output_hidden_states: Optional[bool] = None,
1491
+ return_dict: Optional[bool] = None,
1492
+ **kwargs,
1493
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1494
+ r"""
1495
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1496
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1497
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1498
+ are not taken into account for computing the loss.
1499
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1500
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1501
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1502
+ are not taken into account for computing the loss.
1503
+ """
1504
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1505
+
1506
+ outputs = self.model(
1507
+ input_ids,
1508
+ attention_mask=attention_mask,
1509
+ position_ids=position_ids,
1510
+ past_key_values=past_key_values,
1511
+ inputs_embeds=inputs_embeds,
1512
+ output_attentions=output_attentions,
1513
+ output_hidden_states=output_hidden_states,
1514
+ return_dict=return_dict,
1515
+ )
1516
+
1517
+ sequence_output = outputs[0]
1518
+
1519
+ logits = self.qa_outputs(sequence_output)
1520
+ start_logits, end_logits = logits.split(1, dim=-1)
1521
+ start_logits = start_logits.squeeze(-1).contiguous()
1522
+ end_logits = end_logits.squeeze(-1).contiguous()
1523
+
1524
+ loss = None
1525
+ if start_positions is not None and end_positions is not None:
1526
+ loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
1527
+
1528
+ if not return_dict:
1529
+ output = (start_logits, end_logits) + outputs[2:]
1530
+ return ((loss,) + output) if loss is not None else output
1531
+
1532
+ return QuestionAnsweringModelOutput(
1533
+ loss=loss,
1534
+ start_logits=start_logits,
1535
+ end_logits=end_logits,
1536
+ hidden_states=outputs.hidden_states,
1537
+ attentions=outputs.attentions,
1538
+ )
1539
+
1B_baseline/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db6bba7e1e51674c34691a7daceb7c69fcc6b1145c74ecc85d5fc6e495cff970
3
+ size 3441774370
1B_baseline/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
1B_baseline/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "<tool_call>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "151658": {
125
+ "content": "</tool_call>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "151659": {
133
+ "content": "<|fim_prefix|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "151660": {
141
+ "content": "<|fim_middle|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "151661": {
149
+ "content": "<|fim_suffix|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "151662": {
157
+ "content": "<|fim_pad|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "151663": {
165
+ "content": "<|repo_name|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "151664": {
173
+ "content": "<|file_sep|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ }
180
+ },
181
+ "additional_special_tokens": [
182
+ "<|im_start|>",
183
+ "<|im_end|>",
184
+ "<|object_ref_start|>",
185
+ "<|object_ref_end|>",
186
+ "<|box_start|>",
187
+ "<|box_end|>",
188
+ "<|quad_start|>",
189
+ "<|quad_end|>",
190
+ "<|vision_start|>",
191
+ "<|vision_end|>",
192
+ "<|vision_pad|>",
193
+ "<|image_pad|>",
194
+ "<|video_pad|>"
195
+ ],
196
+ "bos_token": null,
197
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content 
}}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "model_max_length": 131072,
202
+ "pad_token": "<|endoftext|>",
203
+ "split_special_tokens": false,
204
+ "tokenizer_class": "Qwen2Tokenizer",
205
+ "unk_token": null,
206
+ "add_bos_token": false
207
+ }
1B_baseline/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
1B_gate_elementwise/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_qwen3.Qwen3Config",
7
+ "AutoModel": "modeling_qwen3.Qwen3Model",
8
+ "AutoModelForCausalLM": "modeling_qwen3.Qwen3ForCausalLM"
9
+ },
10
+ "attention_dropout": 0.0,
11
+ "elementwise_attn_output_gate": true,
12
+ "headwise_attn_output_gate": false,
13
+ "bos_token_id": 151643,
14
+ "eos_token_id": 151643,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 2048,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 5504,
19
+ "max_position_embeddings": 32768,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen2",
22
+ "num_attention_heads": 16,
23
+ "num_hidden_layers": 28,
24
+ "num_key_value_heads": 8,
25
+ "qkv_bias": false,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_scaling": null,
28
+ "rope_theta": 1000000,
29
+ "sliding_window": null,
30
+ "tie_word_embeddings": true,
31
+ "torch_dtype": "bfloat16",
32
+ "transformers_version": "4.46.0",
33
+ "use_cache": true,
34
+ "use_qk_norm": true,
35
+ "use_sliding_window": false,
36
+ "vocab_size": 152064
37
+ }
1B_gate_elementwise/configuration_qwen3.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+ """Qwen3 model configuration"""
23
+
24
+ from transformers.configuration_utils import PretrainedConfig
25
+ from transformers.modeling_rope_utils import rope_config_validation
26
+ from transformers.utils import logging
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
class Qwen3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a
    Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen3-8B-beta [Qwen/Qwen3-8B-beta](https://huggingface.co/Qwen/Qwen3-8B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen3 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, it falls back to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            Per-head dimension stored on the config as `head_dim`. It is set independently of `hidden_size` and
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. The value is only kept when `use_sliding_window=True`;
            otherwise `config.sliding_window` is stored as `None`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_qk_norm (`bool`, *optional*, defaults to `True`):
            Whether query and key in attention use norm
        elementwise_attn_output_gate (`bool`, *optional*, defaults to `False`):
            Stored on the config as `elementwise_attn_output_gate`; selects the element-wise attention output gate.
        headwise_attn_output_gate (`bool`, *optional*, defaults to `False`):
            Stored on the config as `headwise_attn_output_gate`; selects the head-wise attention output gate.
        qkv_bias (`bool`, *optional*):
            Bias switch stored on the config as `qkv_bias`. When left as `None` it falls back to `attention_bias`,
            so the attribute always exists on the config.

    ```python
    >>> from transformers import Qwen3Model, Qwen3Config

    >>> # Initializing a Qwen3 style configuration
    >>> configuration = Qwen3Config()

    >>> # Initializing a model from the Qwen3-8B style configuration
    >>> model = Qwen3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_bias=False,
        attention_dropout=0.0,
        use_qk_norm=True,
        elementwise_attn_output_gate=False,
        headwise_attn_output_gate=False,
        qkv_bias=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.use_sliding_window = use_sliding_window
        # The window size is only meaningful when sliding-window attention is enabled.
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        # Always define `qkv_bias` so that a default-constructed config exposes it;
        # an explicit value (e.g. from config.json) wins over `attention_bias`.
        self.qkv_bias = attention_bias if qkv_bias is None else qkv_bias
        self.attention_dropout = attention_dropout
        self.use_qk_norm = use_qk_norm

        self.headwise_attn_output_gate = headwise_attn_output_gate
        self.elementwise_attn_output_gate = elementwise_attn_output_gate

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
218
+
1B_gate_elementwise/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": false,
4
+ "eos_token_id": 151643,
5
+ "max_new_tokens": 2048,
6
+ "transformers_version": "4.37.0"
7
+ }
1B_gate_elementwise/modeling_qwen3.py ADDED
@@ -0,0 +1,1539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen3 model."""
21
+
22
+ import math
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
34
+ from transformers.modeling_outputs import (
35
+ BaseModelOutputWithPast,
36
+ CausalLMOutputWithPast,
37
+ QuestionAnsweringModelOutput,
38
+ SequenceClassifierOutputWithPast,
39
+ TokenClassifierOutput,
40
+ )
41
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
42
+ from transformers.modeling_utils import PreTrainedModel
43
+ from transformers.utils import (
44
+ add_code_sample_docstrings,
45
+ add_start_docstrings,
46
+ add_start_docstrings_to_model_forward,
47
+ is_flash_attn_2_available,
48
+ is_flash_attn_greater_or_equal_2_10,
49
+ logging,
50
+ replace_return_docstrings,
51
+ )
52
+ from .configuration_qwen3 import Qwen3Config
53
+
54
+ if is_flash_attn_2_available():
55
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"
60
+ _CONFIG_FOR_DOC = "Qwen3Config"
61
+
62
+
63
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen3
64
class Qwen3RMSNorm(nn.Module):
    """Root-mean-square layer norm with a learned per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen3RMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: Size of the trailing dimension that gets normalized.
            eps: Constant added to the mean square before the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize over the last dimension in float32, then apply `weight`."""
        original_dtype = hidden_states.dtype
        # The statistics are computed in float32 regardless of the input dtype.
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normalized = upcast * inverse_rms
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        """Return the weight shape and epsilon for the module's repr."""
        shape = tuple(self.weight.shape)
        return f"{shape}, eps={self.variance_epsilon}"
82
+
83
+
84
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen3
85
class Qwen3RotaryEmbedding(nn.Module):
    """Produces the cos/sin tables used for rotary position embeddings.

    The inverse frequencies come from the function registered in
    `ROPE_INIT_FUNCTIONS` under the selected `rope_type`. They are kept in a
    non-persistent buffer named `inv_freq`.
    """

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[Qwen3Config] = None,
    ):
        """Set up the inverse-frequency buffer.

        When `config` is given, the rope type and maximum sequence length are
        read from it and the remaining arguments are not used for them. When
        `config` is None, the explicit arguments are forwarded to the rope init
        function as keyword arguments and a one-time warning is logged.
        """
        super().__init__()
        # TODO (joao): remove the `if` below, only used for BC
        self.rope_kwargs = {}
        if config is None:
            logger.warning_once(
                "`Qwen3RotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.46"
            )
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Keep a handle on the initial frequencies so the dynamic update can restore them.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        # The sequence length is taken as the largest position id plus one.
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`.

        `x` is only used for its device and dtype. `position_ids` is indexed as
        a 2-D tensor whose first dimension sizes the expanded frequency tensor.
        """
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        # "mps" (and any non-string device type) is mapped to "cpu" for the autocast context.
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            # The frequencies are duplicated along the last dimension before taking cos/sin.
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
170
+
171
+
172
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
173
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    The last dimension is split at `x.shape[-1] // 2`; the result is the
    negated trailing part followed by the leading part.
    """
    midpoint = x.shape[-1] // 2
    leading, trailing = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-trailing, leading), dim=-1)
178
+
179
+
180
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
181
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Rotate the query and key tensors with the given cos/sin tables.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension inserted into `cos` and `sin` so they broadcast against `q` and `k`. Use 1 when
            q and k are laid out as [batch_size, heads, seq_len, head_dim] and 2 when they are laid out as
            [batch_size, seq_len, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same combination for queries and keys: x * cos + rotate_half(x) * sin.
        return (states * cos_b) + (rotate_half(states) * sin_b)

    return _rotate(q), _rotate(k)
206
+
207
+
208
+ # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen3
209
class Qwen3MLP(nn.Module):
    """Gated feed-forward block: `down_proj(act(gate_proj(x)) * up_proj(x))`."""

    def __init__(self, config):
        """Build the three bias-free projections from `config.hidden_size` and `config.intermediate_size`."""
        super().__init__()
        hidden, inner = config.hidden_size, config.intermediate_size
        self.hidden_size = hidden
        self.intermediate_size = inner
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        # The activation is looked up by name from `config.hidden_act`.
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
        """Apply the gated projection to `hidden_state` and map it back to the hidden size."""
        gated = self.act_fn(self.gate_proj(hidden_state))
        projected = self.up_proj(hidden_state)
        return self.down_proj(gated * projected)
221
+
222
+
223
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
224
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head dimension.

    Equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep): the input goes from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads * n_rep, seqlen, head_dim).
    When `n_rep` is 1 the input tensor itself is returned.
    """
    batch, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis after the head axis, broadcast it, then fold it into the heads.
    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, dim)
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, dim)
234
+
235
+
236
class Qwen3Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Optionally multiplies the attention output by a sigmoid gate whose logits are produced by extra output
    columns of `q_proj`:

    - `headwise_attn_output_gate`: one gate value per attention head.
    - `elementwise_attn_output_gate`: one gate value per element of every head.

    If both flags are set, the head-wise gate takes precedence.
    """

    def __init__(self, config: Qwen3Config, layer_idx: Optional[int] = None):
        """
        Args:
            config: Model configuration; sizes, dropout, QK-norm and gating flags are read from it.
            layer_idx: Index of this layer, passed to the cache when `past_key_value` is updated.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.use_qk_norm = config.use_qk_norm
        self.headwise_attn_output_gate = config.headwise_attn_output_gate
        self.elementwise_attn_output_gate = config.elementwise_attn_output_gate

        # `head_dim` is configured independently, so `num_heads * head_dim` is not required to equal
        # `hidden_size`; `o_proj` maps from `num_heads * head_dim` back to `hidden_size`.

        # Configs that do not define `qkv_bias` fall back to `attention_bias` (then to no bias)
        # instead of failing with an AttributeError.
        use_bias = getattr(config, "qkv_bias", getattr(config, "attention_bias", False))

        # The gate logits share the query projection, so its output is widened accordingly.
        q_out_features = self.num_heads * self.head_dim
        if self.headwise_attn_output_gate:
            q_out_features += self.num_heads
        elif self.elementwise_attn_output_gate:
            q_out_features *= 2
        self.q_proj = nn.Linear(self.hidden_size, q_out_features, bias=use_bias)

        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=use_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=use_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=use_bias)
        if self.use_qk_norm:
            self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
            self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)

        self.rotary_emb = Qwen3RotaryEmbedding(config=self.config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Args:
            hidden_states: Input of shape `(batch, q_len, hidden_size)`.
            attention_mask: Optional additive 4-D mask; it is sliced on its last dimension to the key length
                and added to the attention scores.
            position_ids: Only used to compute the rotary tables when `position_embeddings` is not given.
            past_key_value: Optional cache, updated with this layer's key/value states.
            output_attentions: Whether to return the attention probabilities.
            use_cache: Accepted for interface compatibility; not read in this method.
            cache_position: Forwarded to the cache update.
            position_embeddings: `(cos, sin)` rotary tables. When `None`, they are computed from `position_ids`.

        Returns:
            `(attn_output, attn_weights, past_key_value)`, where `attn_weights` is `None` unless
            `output_attentions` is set.

        Raises:
            ValueError: If neither `position_embeddings` nor `position_ids` is given, or if the attention output
                has an unexpected shape.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        gate_score = None
        if self.headwise_attn_output_gate or self.elementwise_attn_output_gate:
            # Per key/value group the projection holds the grouped queries followed by their gate logits:
            # one logit per head (head-wise) or `head_dim` logits per head (element-wise).
            gate_width = 1 if self.headwise_attn_output_gate else self.head_dim
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states,
                [self.head_dim * self.num_key_value_groups, gate_width * self.num_key_value_groups],
                dim=-1,
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, gate_width)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        else:
            query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        if position_embeddings is None:
            # Fall back to this layer's own rotary module instead of failing on tuple unpacking.
            if position_ids is None:
                raise ValueError("Either `position_embeddings` or `position_ids` must be provided.")
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Back to (batch, q_len, heads, head_dim) so the gate broadcasts per head or per element.
        attn_output = attn_output.transpose(1, 2).contiguous()

        if gate_score is not None:
            attn_output = attn_output * torch.sigmoid(gate_score)

        attn_output = attn_output.reshape(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
366
+
367
+
368
class Qwen3FlashAttention2(Qwen3Attention):
    """
    Qwen3 attention module backed by Flash Attention 2.

    This module inherits from `Qwen3Attention`, so the weights of the module stay untouched. Only the forward pass
    changes: it calls `_flash_attention_forward` and hands it the attention mask so padding tokens can be dealt
    with. A sliding window is forwarded to the kernel only when `config.use_sliding_window` is true,
    `config.sliding_window` is not `None` and `self.layer_idx >= config.max_window_layers`.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
        # alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle the
        # difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1)
        # produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ):
        """
        Run self-attention through `_flash_attention_forward`.

        Args:
            hidden_states: input whose `size()` unpacks to `(batch, seq_len, hidden)`.
            attention_mask: forwarded unchanged to `_flash_attention_forward`.
            position_ids: forwarded unchanged to `_flash_attention_forward`.
            past_key_value: optional cache; when given, it is updated with this layer's key/value states.
            output_attentions: accepted for interface compatibility; this path never produces attention weights.
            use_cache: accepted for interface compatibility; not read in this method.
            cache_position: passed to the cache update through `cache_kwargs`.
            position_embeddings: `(cos, sin)` tuple consumed by `apply_rotary_pos_emb`. It is unpacked
                unconditionally, so it must be provided despite the `None` default.

        Returns:
            `(attn_output, None, past_key_value)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        if self.headwise_attn_output_gate:
            # The query projection also carries one gate logit per head, laid out per key/value group:
            # split it off the trailing dimension before shaping the queries.
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states, [self.head_dim * self.num_key_value_groups, self.num_key_value_groups], dim=-1
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, 1)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        elif self.elementwise_attn_output_gate:
            # Same layout, but with one gate logit per head *and* per channel.
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states,
                [self.head_dim * self.num_key_value_groups, self.head_dim * self.num_key_value_groups],
                dim=-1,
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, self.head_dim)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        else:
            query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # NOTE: unlike the eager path, key/value heads are not expanded with `repeat_kv` here; presumably the
        # flash-attention kernel handles n_kv_heads < n_heads itself -- confirm against `_flash_attention_forward`.
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons,
        # therefore the input hidden states get silently cast to float32. Hence, we need to
        # cast them back to the expected half-precision dtype to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the layout expected by Flash Attention: sequence before heads.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window
        else:
            sliding_window = None

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=sliding_window,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        if self.headwise_attn_output_gate or self.elementwise_attn_output_gate:
            # `gate_score` has a trailing dimension of 1 (head-wise) or head_dim (element-wise) and is
            # multiplied into the per-head output before the heads are merged.
            attn_output = attn_output * torch.sigmoid(gate_score)

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        # This path never computes attention probabilities, so there is nothing to return for them. The name
        # must be bound unconditionally: binding it only when `output_attentions` is False makes the return
        # statement raise UnboundLocalError whenever `output_attentions=True`.
        attn_weights = None

        return attn_output, attn_weights, past_key_value
492
+
493
+
494
+
495
class Qwen3SdpaAttention(Qwen3Attention):
    """
    Qwen3 attention running on `torch.nn.functional.scaled_dot_product_attention`.

    The parameters are exactly those of `Qwen3Attention`; only the forward pass is rewritten to go through the
    SDPA API. Requests for attention weights are delegated to the parent (eager) implementation.
    """

    # Adapted from Qwen3Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Compute self-attention with SDPA and return `(attn_output, None, past_key_value)`.

        When `output_attentions` is true, a one-time warning is logged and the call is forwarded to
        `Qwen3Attention.forward`, whose result is returned as-is.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "Qwen3Model is using Qwen3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        batch_size, seq_len, _ = hidden_states.size()

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        # Work out how the query projection has to be split into (queries, gate logits).
        # Head-wise gating takes precedence over element-wise gating when both flags are set.
        query_width = self.head_dim * self.num_key_value_groups
        if self.headwise_attn_output_gate:
            gate_width, gate_channels = self.num_key_value_groups, 1
        elif self.elementwise_attn_output_gate:
            gate_width, gate_channels = query_width, self.head_dim
        else:
            gate_width, gate_channels = None, None

        if gate_width is None:
            queries = queries.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2)
        else:
            grouped = queries.view(batch_size, seq_len, self.num_key_value_heads, -1)
            queries, gate_score = torch.split(grouped, [query_width, gate_width], dim=-1)
            gate_score = gate_score.reshape(batch_size, seq_len, -1, gate_channels)
            queries = queries.reshape(batch_size, seq_len, -1, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            queries = self.q_norm(queries)
            keys = self.k_norm(keys)

        cos, sin = position_embeddings
        queries, keys = apply_rotary_pos_emb(queries, keys, cos, sin)

        if past_key_value is not None:
            # sin/cos/cache_position are specific to RoPE models
            rope_cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            keys, values = past_key_value.update(keys, values, self.layer_idx, rope_cache_kwargs)

        # Expand key/value heads so that their head count matches the queries.
        keys = repeat_kv(keys, self.num_key_value_groups)
        values = repeat_kv(values, self.num_key_value_groups)

        # Whatever its length, the mask is simply cut to the number of key positions.
        if attention_mask is None:
            causal_mask = None
        else:
            causal_mask = attention_mask[:, :, :, : keys.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with
        # custom attn_mask. Reference: https://github.com/pytorch/pytorch/issues/112577.
        if queries.device.type == "cuda" and attention_mask is not None:
            queries = queries.contiguous()
            keys = keys.contiguous()
            values = values.contiguous()

        # `is_causal` is resolved in a separate statement (not inline in the SDPA call) to support both
        # torch.compile's dynamic shapes and full graph options. The seq_len > 1 check matches
        # AttentionMaskConverter.to_causal_4d, which does not create a causal mask when the query length is 1.
        is_causal = False
        if causal_mask is None and seq_len > 1:
            is_causal = True

        dropout_p = self.attention_dropout if self.training else 0.0
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            queries,
            keys,
            values,
            attn_mask=causal_mask,
            dropout_p=dropout_p,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()

        if self.headwise_attn_output_gate or self.elementwise_attn_output_gate:
            attn_output = attn_output * torch.sigmoid(gate_score)

        merged = attn_output.view(batch_size, seq_len, self.num_heads * self.head_dim)

        return self.o_proj(merged), None, past_key_value
602
+
603
# Dispatch table from an attention-implementation name to the attention module class.
# `Qwen3DecoderLayer` indexes it with `config._attn_implementation`.
QWEN3_ATTENTION_CLASSES = {
    "eager": Qwen3Attention,
    "flash_attention_2": Qwen3FlashAttention2,
    "sdpa": Qwen3SdpaAttention,
}
608
+
609
+
610
class Qwen3DecoderLayer(nn.Module):
    """
    One decoder block: normalized self-attention followed by a normalized MLP, each wrapped in a residual
    connection.
    """

    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        attn_implementation = config._attn_implementation
        swa_unsupported = attn_implementation != "flash_attention_2"
        if config.sliding_window and swa_unsupported:
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )
        attention_cls = QWEN3_ATTENTION_CLASSES[attn_implementation]
        self.self_attn = attention_cls(config, layer_idx)

        self.mlp = Qwen3MLP(config)
        self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Apply the attention sub-block and then the MLP sub-block to `hidden_states`.

        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask, handed to the attention module
                unchanged.
            position_ids (`torch.LongTensor`, *optional*): handed to the attention module unchanged.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                When true, the attention weights returned by the attention module are appended to the output
                tuple.
            use_cache (`bool`, *optional*):
                When true, the key/value cache returned by the attention module is appended to the output tuple.
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape
                `(batch_size, seq_len, head_dim)`, with `head_dim` being the embedding dimension of each
                attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code into the model.

        Returns:
            A tuple starting with the new hidden states, followed by the attention weights (only if
            `output_attentions`) and then the key/value cache (only if `use_cache`).
        """
        # Self-attention sub-block with its residual connection.
        attn_input = self.input_layernorm(hidden_states)
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = hidden_states + attn_output

        # Fully connected sub-block with its residual connection.
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_output

        collected = [hidden_states]
        if output_attentions:
            collected.append(self_attn_weights)
        if use_cache:
            collected.append(present_key_value)

        return tuple(collected)
692
+
693
+
694
+ QWEN3_START_DOCSTRING = r"""
695
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
696
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
697
+ etc.)
698
+
699
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
700
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
701
+ and behavior.
702
+
703
+ Parameters:
704
+ config ([`Qwen3Config`]):
705
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
706
+ load the weights associated with the model, only the configuration. Check out the
707
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
708
+ """
709
+
710
+
711
@add_start_docstrings(
    "The bare Qwen3 Model outputting raw hidden-states without any specific head on top.",
    QWEN3_START_DOCSTRING,
)
class Qwen3PreTrainedModel(PreTrainedModel):
    # Shared base class of the Qwen3 models: it binds the config class, declares which library features
    # the architecture supports, and defines how freshly created weights are initialized.
    config_class = Qwen3Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        """
        Initialize `module` in place.

        `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with mean 0 and standard
        deviation `config.initializer_range`; linear biases and the embedding row at `padding_idx` are zeroed.
        Any other module is left untouched.
        """
        init_std = self.config.initializer_range

        is_linear = isinstance(module, nn.Linear)
        if not is_linear and not isinstance(module, nn.Embedding):
            return

        module.weight.data.normal_(mean=0.0, std=init_std)
        if is_linear:
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
737
+
738
+
739
+ QWEN3_INPUTS_DOCSTRING = r"""
740
+ Args:
741
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
742
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
743
+ it.
744
+
745
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
746
+ [`PreTrainedTokenizer.__call__`] for details.
747
+
748
+ [What are input IDs?](../glossary#input-ids)
749
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
750
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
751
+
752
+ - 1 for tokens that are **not masked**,
753
+ - 0 for tokens that are **masked**.
754
+
755
+ [What are attention masks?](../glossary#attention-mask)
756
+
757
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
758
+ [`PreTrainedTokenizer.__call__`] for details.
759
+
760
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
761
+ `past_key_values`).
762
+
763
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
764
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
765
+ information on the default strategy.
766
+
767
+ - 1 indicates the head is **not masked**,
768
+ - 0 indicates the head is **masked**.
769
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
770
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
771
+ config.n_positions - 1]`.
772
+
773
+ [What are position IDs?](../glossary#position-ids)
774
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
775
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
776
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
777
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
778
+
779
+ Two formats are allowed:
780
+ - a [`~cache_utils.Cache`] instance, see our
781
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
782
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
783
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
784
+ cache format.
785
+
786
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
787
+ legacy cache format will be returned.
788
+
789
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
790
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
791
+ of shape `(batch_size, sequence_length)`.
792
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
793
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
794
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
795
+ model's internal embedding lookup matrix.
796
+ use_cache (`bool`, *optional*):
797
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
798
+ `past_key_values`).
799
+ output_attentions (`bool`, *optional*):
800
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
801
+ tensors for more detail.
802
+ output_hidden_states (`bool`, *optional*):
803
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
804
+ more detail.
805
+ return_dict (`bool`, *optional*):
806
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
807
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
808
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
809
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
810
+ the complete sequence length.
811
+ """
812
+
813
+
814
+ @add_start_docstrings(
815
+ "The bare Qwen3 Model outputting raw hidden-states without any specific head on top.",
816
+ QWEN3_START_DOCSTRING,
817
+ )
818
+ class Qwen3Model(Qwen3PreTrainedModel):
819
+ """
820
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen3DecoderLayer`]
821
+
822
+ Args:
823
+ config: Qwen3Config
824
+ """
825
+
826
+ def __init__(self, config: Qwen3Config):
827
+ super().__init__(config)
828
+ self.padding_idx = config.pad_token_id
829
+ self.vocab_size = config.vocab_size
830
+
831
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
832
+ self.layers = nn.ModuleList(
833
+ [Qwen3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
834
+ )
835
+ self._attn_implementation = config._attn_implementation
836
+ self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
837
+ self.rotary_emb = Qwen3RotaryEmbedding(config=config)
838
+
839
+ self.gradient_checkpointing = False
840
+ # Initialize weights and apply final processing
841
+ self.post_init()
842
+
843
    def get_input_embeddings(self):
        """Return the token embedding module (`self.embed_tokens`)."""
        return self.embed_tokens
845
+
846
    def set_input_embeddings(self, value):
        """Replace the token embedding module (`self.embed_tokens`) with `value`."""
        self.embed_tokens = value
848
+
849
    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # Runs the embedded inputs through every decoder layer and the final norm. Returns a
        # `BaseModelOutputWithPast`, or a plain tuple of its non-None fields when `return_dict` is false.

        # Per-call flags fall back to the values stored on the config when left as None.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # XOR check: passing both or neither of `input_ids` / `inputs_embeds` is rejected.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # kept for BC (non `Cache` `past_key_values` inputs)
        # Anything that is not a `Cache` instance is wrapped in a `DynamicCache` for the duration of the call
        # and converted back to the legacy format before returning.
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            if past_key_values is None:
                past_key_values = DynamicCache()
            else:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # Default cache positions continue right after the tokens already held by the cache.
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            # Hidden states are recorded *before* each layer; the final (normalized) state is added after the loop.
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Arguments are passed positionally here, in the order of the decoder layer's signature.
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )

            hidden_states = layer_outputs[0]

            # The cache sits after the attention weights in the layer output when those are returned.
            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
974
+
975
    # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        """
        Build the attention mask handed to the decoder layers for the configured attention implementation.

        Returns `None` when no explicit mask is needed, the caller's `attention_mask` unchanged on the
        flash-attention path when it contains a zero, and otherwise the 4D mask produced by
        `_prepare_4d_causal_attention_mask_with_cache_position`.
        """
        # Flash Attention path: the mask is passed through only when it actually masks something.
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            # Use the mask's own key length when a mask tensor was given; otherwise derive it from the
            # cached tokens plus the current sequence length.
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask
1050
+
1051
+ @staticmethod
1052
+ # Copied from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Qwen3
1053
+ def _prepare_4d_causal_attention_mask_with_cache_position(
1054
+ attention_mask: torch.Tensor,
1055
+ sequence_length: int,
1056
+ target_length: int,
1057
+ dtype: torch.dtype,
1058
+ device: torch.device,
1059
+ cache_position: torch.Tensor,
1060
+ batch_size: int,
1061
+ config: Qwen3Config,
1062
+ past_key_values: Cache,
1063
+ ):
1064
+ """
1065
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1066
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
1067
+
1068
+ Args:
1069
+ attention_mask (`torch.Tensor`):
1070
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
1071
+ sequence_length (`int`):
1072
+ The sequence length being processed.
1073
+ target_length (`int`):
1074
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
1075
+ dtype (`torch.dtype`):
1076
+ The dtype to use for the 4D attention mask.
1077
+ device (`torch.device`):
1078
+ The device to plcae the 4D attention mask on.
1079
+ cache_position (`torch.Tensor`):
1080
+ Indices depicting the position of the input sequence tokens in the sequence.
1081
+ batch_size (`torch.Tensor`):
1082
+ Batch size.
1083
+ config (`Qwen3Config`):
1084
+ The model's configuration class
1085
+ past_key_values (`Cache`):
1086
+ The cache class that is being used currently to generate
1087
+ """
1088
+ if attention_mask is not None and attention_mask.dim() == 4:
1089
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
1090
+ causal_mask = attention_mask
1091
+ else:
1092
+ min_dtype = torch.finfo(dtype).min
1093
+ causal_mask = torch.full(
1094
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
1095
+ )
1096
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1097
+ if config.sliding_window is not None:
1098
+ # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
1099
+ # the check is needed to verify is current checkpoint was trained with sliding window or not
1100
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
1101
+ sliding_attend_mask = torch.arange(target_length, device=device) <= (
1102
+ cache_position.reshape(-1, 1) - config.sliding_window
1103
+ )
1104
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
1105
+ causal_mask *= diagonal_attend_mask
1106
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
1107
+ if attention_mask is not None:
1108
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1109
+ if attention_mask.shape[-1] > target_length:
1110
+ attention_mask = attention_mask[:, :target_length]
1111
+ mask_length = attention_mask.shape[-1]
1112
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
1113
+ padding_mask = padding_mask == 0
1114
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1115
+ padding_mask, min_dtype
1116
+ )
1117
+ return causal_mask
1118
+
1119
+
1120
class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin):
    """`Qwen3Model` decoder followed by a bias-free linear head projecting hidden states to vocabulary logits."""

    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}

    def __init__(self, config):
        """Build the decoder and the LM head from `config`, then run the shared post-init hook."""
        super().__init__(config)
        self.model = Qwen3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(
            in_features=config.hidden_size,
            out_features=config.vocab_size,
            bias=False,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token-embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token-embedding module with `value`."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder with `decoder`."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model

    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        num_logits_to_keep: int = 0,
        **loss_kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            num_logits_to_keep (`int`, *optional*):
                Number of trailing positions for which logits are computed. `0` is a special case meaning "all
                positions". Generation only needs the last token's logits, so restricting the projection to that
                token saves memory for long sequences or large vocabularies.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        # Per-call flags win; anything left as None falls back to the model configuration.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        decoder_outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # Project only the requested trailing positions (`-0:` selects the whole sequence).
        last_hidden = decoder_outputs[0]
        kept_hidden = last_hidden[:, -num_logits_to_keep:, :]
        logits = self.lm_head(kept_hidden)

        if labels is None:
            loss = None
        else:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)

        if return_dict:
            return CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=decoder_outputs.past_key_values,
                hidden_states=decoder_outputs.hidden_states,
                attentions=decoder_outputs.attentions,
            )

        # Tuple form: logits replace the hidden states; the loss, when present, is prepended.
        flat_outputs = (logits,) + decoder_outputs[1:]
        if loss is None:
            return flat_outputs
        return (loss,) + flat_outputs
1239
+
1240
+
1241
@add_start_docstrings(
    """
    The Qwen3 Model transformer with a sequence classification head on top (linear layer).

    [`Qwen3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    QWEN3_START_DOCSTRING,
)
class Qwen3ForSequenceClassification(Qwen3PreTrainedModel):
    def __init__(self, config):
        """Build the decoder and a bias-free `hidden_size -> num_labels` scoring layer."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen3Model(config)
        self.score = nn.Linear(
            in_features=config.hidden_size,
            out_features=self.num_labels,
            bias=False,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token-embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token-embedding module with `value`."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Targets for the sequence-level loss. Indices should be in `[0, ..., config.num_labels - 1]`. With
            `config.num_labels == 1` a regression loss (Mean-Square) is used; with `config.num_labels > 1` a
            classification loss (Cross-Entropy) is used.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = self.score(transformer_outputs[0])

        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]

        pad_token_id = self.config.pad_token_id
        if pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if pad_token_id is not None and input_ids is not None:
            # Index of the token just before the first pad token in each row. When a row has no pad token the
            # argmax is 0, so the modulo wraps -1 to the last position (used instead of reverse indexing for
            # ONNX compatibility).
            first_pad = torch.eq(input_ids, pad_token_id).int().argmax(-1)
            sequence_lengths = (first_pad - 1) % input_ids.shape[-1]
            sequence_lengths = sequence_lengths.to(logits.device)
        else:
            # No pad token configured, or only embeddings were given: pool the last position of every row.
            sequence_lengths = -1

        row_index = torch.arange(batch_size, device=logits.device)
        pooled_logits = logits[row_index, sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                # Infer the problem type once from the head size and label dtype, and store it on the config.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(pooled_logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(pooled_logits, labels)

        if return_dict:
            return SequenceClassifierOutputWithPast(
                loss=loss,
                logits=pooled_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )

        flat_outputs = (pooled_logits,) + transformer_outputs[1:]
        if loss is None:
            return flat_outputs
        return (loss,) + flat_outputs
1362
+
1363
+
1364
@add_start_docstrings(
    """
    The Qwen3 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    QWEN3_START_DOCSTRING,
)
# Adapted from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen3, LLAMA->QWEN3
class Qwen3ForTokenClassification(Qwen3PreTrainedModel):
    def __init__(self, config):
        """Build the decoder, a dropout layer and a `hidden_size -> num_labels` scoring layer."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen3Model(config)

        # Dropout rate precedence: `classifier_dropout`, then `hidden_dropout`, then a fixed 0.1.
        classifier_dropout = getattr(config, "classifier_dropout", None)
        if classifier_dropout is None:
            classifier_dropout = getattr(config, "hidden_dropout", None)
        if classifier_dropout is None:
            classifier_dropout = 0.1

        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token-embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token-embedding module with `value`."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Per-token labels for computing the token classification loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. The loss itself is delegated to `self.loss_function`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Score every position: dropout on the final hidden states, then the linear head.
        token_states = self.dropout(outputs[0])
        logits = self.score(token_states)

        if labels is None:
            loss = None
        else:
            loss = self.loss_function(logits, labels, self.config)

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form skips the first two decoder outputs and prepends the loss when there is one.
        flat_outputs = (logits,) + outputs[2:]
        if loss is None:
            return flat_outputs
        return (loss,) + flat_outputs
1451
+
1452
+
1453
@add_start_docstrings(
    """
    The Qwen3 Model transformer with a span classification head on top for extractive question-answering tasks like
    SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    QWEN3_START_DOCSTRING,
)
# Adapted from transformers.models.mistral.modeling_mistral.MistralForQuestionAnswering with Mistral->Qwen3, MISTRAL->QWEN3
class Qwen3ForQuestionAnswering(Qwen3PreTrainedModel):
    base_model_prefix = "model"

    # Adapted from models.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Qwen3
    def __init__(self, config):
        """Build the decoder and a two-output linear layer (one logit each for span start and span end)."""
        super().__init__(config)
        self.model = Qwen3Model(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token-embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token-embedding module with `value`."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The head emits two values per position; split them into separate start and end logit tensors.
        span_logits = self.qa_outputs(outputs[0])
        start_logits, end_logits = span_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        # A loss is computed only when both span boundaries are supplied.
        have_span_labels = start_positions is not None and end_positions is not None
        if have_span_labels:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
        else:
            loss = None

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        flat_outputs = (start_logits, end_logits) + outputs[2:]
        if loss is None:
            return flat_outputs
        return (loss,) + flat_outputs
1539
+
1B_gate_elementwise/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:284bcb0e64b42027fe6fa13f513a1a5659370d4f681a009a53c32d9b20121e79
3
+ size 3456454434
1B_gate_elementwise/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
1B_gate_elementwise/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "<tool_call>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "151658": {
125
+ "content": "</tool_call>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "151659": {
133
+ "content": "<|fim_prefix|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "151660": {
141
+ "content": "<|fim_middle|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "151661": {
149
+ "content": "<|fim_suffix|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "151662": {
157
+ "content": "<|fim_pad|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "151663": {
165
+ "content": "<|repo_name|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "151664": {
173
+ "content": "<|file_sep|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ }
180
+ },
181
+ "additional_special_tokens": [
182
+ "<|im_start|>",
183
+ "<|im_end|>",
184
+ "<|object_ref_start|>",
185
+ "<|object_ref_end|>",
186
+ "<|box_start|>",
187
+ "<|box_end|>",
188
+ "<|quad_start|>",
189
+ "<|quad_end|>",
190
+ "<|vision_start|>",
191
+ "<|vision_end|>",
192
+ "<|vision_pad|>",
193
+ "<|image_pad|>",
194
+ "<|video_pad|>"
195
+ ],
196
+ "bos_token": null,
197
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content 
}}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "model_max_length": 131072,
202
+ "pad_token": "<|endoftext|>",
203
+ "split_special_tokens": false,
204
+ "tokenizer_class": "Qwen2Tokenizer",
205
+ "unk_token": null,
206
+ "add_bos_token": false
207
+ }
1B_gate_elementwise/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
1B_gate_headwise/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_qwen3.Qwen3Config",
7
+ "AutoModel": "modeling_qwen3.Qwen3Model",
8
+ "AutoModelForCausalLM": "modeling_qwen3.Qwen3ForCausalLM"
9
+ },
10
+ "attention_dropout": 0.0,
11
+ "elementwise_attn_output_gate": false,
12
+ "headwise_attn_output_gate": true,
13
+ "bos_token_id": 151643,
14
+ "eos_token_id": 151643,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 2048,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 6144,
19
+ "max_position_embeddings": 32768,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen2",
22
+ "num_attention_heads": 16,
23
+ "num_hidden_layers": 28,
24
+ "num_key_value_heads": 8,
25
+ "qkv_bias": false,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_scaling": null,
28
+ "rope_theta": 1000000,
29
+ "sliding_window": null,
30
+ "tie_word_embeddings": true,
31
+ "torch_dtype": "bfloat16",
32
+ "transformers_version": "4.46.0",
33
+ "use_cache": true,
34
+ "use_qk_norm": true,
35
+ "use_sliding_window": false,
36
+ "vocab_size": 152064
37
+ }
1B_gate_headwise/configuration_qwen3.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+ """Qwen3 model configuration"""
23
+
24
+ from transformers.configuration_utils import PretrainedConfig
25
+ from transformers.modeling_rope_utils import rope_config_validation
26
+ from transformers.utils import logging
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
class Qwen3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a
    Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen3-8B-beta [Qwen/Qwen3-8B-beta](https://huggingface.co/Qwen/Qwen3-8B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the Qwen3 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Qwen3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head. It is configured independently of `hidden_size`, so
            `head_dim * num_attention_heads` does not have to equal `hidden_size`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`. The value is only
            kept when `use_sliding_window=True`; otherwise the stored attribute is `None`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_qk_norm (`bool`, *optional*, defaults to `True`):
            Whether query and key in attention use norm
        elementwise_attn_output_gate (`bool`, *optional*, defaults to `False`):
            Whether the attention output is multiplied by a sigmoid gate with one value per element of every head.
            The gate scores are produced by extra output channels of the query projection.
        headwise_attn_output_gate (`bool`, *optional*, defaults to `False`):
            Whether the attention output is multiplied by a sigmoid gate with one scalar per head. The gate scores
            are produced by extra output channels of the query projection. If both gate flags are set, the
            attention layer uses the head-wise gate.
        qkv_bias (`bool`, *optional*):
            Bias flag read by the attention layer for its projection layers. When left as `None` it falls back to
            the value of `attention_bias`, so that a default-constructed configuration always exposes the attribute.
    ```python
    >>> from transformers import Qwen3Model, Qwen3Config

    >>> # Initializing a Qwen3 style configuration
    >>> configuration = Qwen3Config()

    >>> # Initializing a model from the Qwen3-8B style configuration
    >>> model = Qwen3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "qwen3"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen3`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_bias=False,
        attention_dropout=0.0,
        use_qk_norm=True,
        elementwise_attn_output_gate=False,
        headwise_attn_output_gate=False,
        qkv_bias=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.use_sliding_window = use_sliding_window
        # The window size is only meaningful when sliding-window attention is enabled.
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.use_qk_norm = use_qk_norm

        # The attention layer reads `config.qkv_bias`. Previously the attribute only existed when a
        # checkpoint's config.json supplied it through **kwargs, so a default config failed at model
        # construction time. Always define it, defaulting to `attention_bias`.
        self.qkv_bias = attention_bias if qkv_bias is None else qkv_bias

        self.headwise_attn_output_gate = headwise_attn_output_gate
        self.elementwise_attn_output_gate = elementwise_attn_output_gate

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
218
+
1B_gate_headwise/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": false,
4
+ "eos_token_id": 151643,
5
+ "max_new_tokens": 2048,
6
+ "transformers_version": "4.37.0"
7
+ }
1B_gate_headwise/modeling_qwen3.py ADDED
@@ -0,0 +1,1539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen3 model."""
21
+
22
+ import math
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
34
+ from transformers.modeling_outputs import (
35
+ BaseModelOutputWithPast,
36
+ CausalLMOutputWithPast,
37
+ QuestionAnsweringModelOutput,
38
+ SequenceClassifierOutputWithPast,
39
+ TokenClassifierOutput,
40
+ )
41
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
42
+ from transformers.modeling_utils import PreTrainedModel
43
+ from transformers.utils import (
44
+ add_code_sample_docstrings,
45
+ add_start_docstrings,
46
+ add_start_docstrings_to_model_forward,
47
+ is_flash_attn_2_available,
48
+ is_flash_attn_greater_or_equal_2_10,
49
+ logging,
50
+ replace_return_docstrings,
51
+ )
52
+ from .configuration_qwen3 import Qwen3Config
53
+
54
+ if is_flash_attn_2_available():
55
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"
60
+ _CONFIG_FOR_DOC = "Qwen3Config"
61
+
62
+
63
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen3
64
class Qwen3RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen3RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.variance_epsilon = eps
        # Learnable per-channel scale, initialized to ones.
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalize `hidden_states` in float32 and return the scaled result."""
        original_dtype = hidden_states.dtype
        # The statistics are computed in float32 regardless of the input dtype.
        states_fp32 = hidden_states.to(torch.float32)
        mean_square = states_fp32.pow(2).mean(-1, keepdim=True)
        normalized = states_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
        # Cast back to the input dtype before applying the learned scale.
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        """Return the weight shape and epsilon for the module's repr."""
        weight_shape = tuple(self.weight.shape)
        return f"{weight_shape}, eps={self.variance_epsilon}"
82
+
83
+
84
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen3
85
class Qwen3RotaryEmbedding(nn.Module):
    """Produces the cos/sin rotary position embedding tables for given position ids."""

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[Qwen3Config] = None,
    ):
        """
        Set up the inverse frequencies, either from `config` (preferred) or, for backward compatibility,
        from the individual keyword arguments.
        """
        super().__init__()
        # TODO (joao): remove the legacy (config is None) branch below, only used for BC
        self.rope_kwargs = {}
        if config is not None:
            # BC: "rope_type" was originally "type"
            scaling_cfg = config.rope_scaling
            if scaling_cfg is None:
                self.rope_type = "default"
            else:
                self.rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type"))
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings
        else:
            logger.warning_once(
                "`Qwen3RotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.46"
            )
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        initial_inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
        self.register_buffer("inv_freq", initial_inv_freq, persistent=False)
        # Kept so that the dynamic update can restore the initial frequencies.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        current_len = torch.max(position_ids) + 1

        # Case 1: growth beyond what is cached -> recompute for the new length.
        if current_len > self.max_seq_len_cached:
            grown_inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=current_len, **self.rope_kwargs
            )
            self.register_buffer("inv_freq", grown_inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = current_len

        # Case 2: back inside the original range after having grown -> reset.
        back_in_original_range = current_len < self.original_max_seq_len
        if back_in_original_range and self.max_seq_len_cached > self.original_max_seq_len:
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`."""
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block
        batch = position_ids.shape[0]
        freq_column = self.inv_freq[None, :, None].float().expand(batch, -1, 1)
        position_row = position_ids[:, None, :].float()

        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        autocast_device = x.device.type
        if not isinstance(autocast_device, str) or autocast_device == "mps":
            autocast_device = "cpu"
        with torch.autocast(device_type=autocast_device, enabled=False):
            angles = (freq_column.float() @ position_row.float()).transpose(1, 2)
            doubled = torch.cat((angles, angles), dim=-1)
            cos_table = doubled.cos()
            sin_table = doubled.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos_table = cos_table * self.attention_scaling
        sin_table = sin_table * self.attention_scaling

        return cos_table.to(dtype=x.dtype), sin_table.to(dtype=x.dtype)
170
+
171
+
172
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
173
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    # The negated second half goes first, followed by the first half.
    return torch.cat((-back, front), dim=-1)
178
+
179
+
180
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
181
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
            For example, with `cos`/`sin` of shape [batch_size, seq_len, head_dim] and `q`/`k` of shape
            [batch_size, heads, seq_len, head_dim], use `unsqueeze_dim=1`; if `q`/`k` have the shape
            [batch_size, seq_len, heads, head_dim], use `unsqueeze_dim=2`.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same rotation formula for both query and key.
        return (states * cos_b) + (rotate_half(states) * sin_b)

    return _rotate(q), _rotate(k)
206
+
207
+
208
+ # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen3
209
class Qwen3MLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config):
        """Build the three projections from `config.hidden_size`, `config.intermediate_size` and `config.hidden_act`."""
        super().__init__()
        model_dim, inner_dim = config.hidden_size, config.intermediate_size
        self.hidden_size = model_dim
        self.intermediate_size = inner_dim
        self.gate_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.up_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.down_proj = nn.Linear(inner_dim, model_dim, bias=False)
        # Activation is looked up by name from the config.
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
        """Apply the gated projection to `hidden_state` and project back to the hidden size."""
        activated_gate = self.act_fn(self.gate_proj(hidden_state))
        up_projected = self.up_proj(hidden_state)
        return self.down_proj(activated_gate * up_projected)
221
+
222
+
223
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
224
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold it into the head axis.
        expanded = hidden_states[:, :, None, :, :].expand(bsz, kv_heads, n_rep, seq_len, dim)
        return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
    # Nothing to repeat: hand back the very same tensor.
    return hidden_states
234
+
235
+
236
class Qwen3Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Depending on the config, the attention output can additionally be multiplied by a sigmoid gate whose scores
    come from extra output channels of the query projection: one scalar per head
    (`headwise_attn_output_gate`) or one value per element of each head (`elementwise_attn_output_gate`).
    When both flags are set, the head-wise gate is used.
    """

    def __init__(self, config: Qwen3Config, layer_idx: Optional[int] = None):
        """
        Args:
            config: Model configuration; supplies the head layout, dropout, qk-norm and gate settings.
            layer_idx: Index of this layer, passed to the key/value cache on update.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.use_qk_norm = config.use_qk_norm
        self.headwise_attn_output_gate = config.headwise_attn_output_gate
        self.elementwise_attn_output_gate = config.elementwise_attn_output_gate

        # `qkv_bias` is only present on the config when a checkpoint's config.json provides it;
        # fall back to `attention_bias` (and finally to no bias) so a default config still works.
        projection_bias = getattr(config, "qkv_bias", None)
        if projection_bias is None:
            projection_bias = getattr(config, "attention_bias", False)

        # NOTE: `head_dim` is read from the config independently of `hidden_size`, so
        # `head_dim * num_heads` is not required to equal `hidden_size`; q_proj/o_proj map between the two.
        query_width = self.num_heads * self.head_dim
        if self.headwise_attn_output_gate:
            # One extra gate logit per head.
            q_out_features = query_width + self.num_heads
        elif self.elementwise_attn_output_gate:
            # One extra gate logit per query element.
            q_out_features = query_width * 2
        else:
            q_out_features = query_width

        self.q_proj = nn.Linear(self.hidden_size, q_out_features, bias=projection_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=projection_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=projection_bias)
        self.o_proj = nn.Linear(query_width, self.hidden_size, bias=projection_bias)
        if self.use_qk_norm:
            self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
            self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)

        self.rotary_emb = Qwen3RotaryEmbedding(config=self.config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Run eager (matmul + softmax) attention.

        Args:
            hidden_states: Input of shape `(batch, seq_len, hidden_size)`.
            attention_mask: Optional additive mask; it is sliced on its last axis to the key length.
            position_ids: Only used to compute rotary embeddings when `position_embeddings` is not given.
            past_key_value: Optional cache, updated with this layer's key/value states.
            output_attentions: Whether to return the attention probabilities.
            use_cache: Unused here; kept for interface compatibility.
            cache_position: Forwarded to the cache update.
            position_embeddings: Precomputed `(cos, sin)` rotary tables.

        Returns:
            `(attn_output, attn_weights or None, past_key_value)`.

        Raises:
            ValueError: If the attention output does not have shape `(batch, num_heads, seq_len, head_dim)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # The query projection packs the gate logits next to the queries, grouped by key/value head.
        gate_score = None
        if self.headwise_attn_output_gate:
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states, [self.head_dim * self.num_key_value_groups, self.num_key_value_groups], dim=-1
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, 1)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        elif self.elementwise_attn_output_gate:
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states,
                [self.head_dim * self.num_key_value_groups, self.head_dim * self.num_key_value_groups],
                dim=-1,
            )
            gate_score = gate_score.reshape(bsz, q_len, -1, self.head_dim)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        else:
            query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        if position_embeddings is None:
            # The argument is declared optional, so compute the tables locally instead of failing on unpacking.
            logger.warning_once(
                "`position_embeddings` was not provided to the attention layer; computing the rotary embeddings "
                "from `position_ids` instead. Passing precomputed `position_embeddings` is recommended."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        if gate_score is not None:
            # After the transpose the output is (batch, seq_len, heads, head_dim); the gate broadcasts against it.
            attn_output = attn_output * torch.sigmoid(gate_score)

        attn_output = attn_output.reshape(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
366
+
367
+
368
class Qwen3FlashAttention2(Qwen3Attention):
    """
    Qwen3 flash attention module, following the Qwen3 attention module. This module inherits from
    `Qwen3Attention` as the weights of the module stay untouched. The only required change is on the forward
    pass, which calls the public flash attention API and deals with padding tokens in case the input contains
    any of them. A sliding window is only requested when `config.use_sliding_window` is set,
    `config.sliding_window` is not `None` and `layer_idx >= config.max_window_layers`.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
        # alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this
        # difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1)
        # produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ):
        """
        Run self-attention through `_flash_attention_forward`.

        Args:
            hidden_states: input of shape `(batch, seq_len, hidden)`.
            attention_mask: mask forwarded unchanged to `_flash_attention_forward`.
            position_ids: forwarded unchanged to `_flash_attention_forward`.
            past_key_value: cache object; when given, its `update` method is called with this layer's
                key/value states and the returned states are used for attention.
            output_attentions: accepted for interface compatibility; attention probabilities are never
                produced on this path.
            use_cache: accepted for interface compatibility; not read here.
            cache_position: passed to the cache `update` call.
            position_embeddings: `(cos, sin)` tuple consumed by `apply_rotary_pos_emb`.

        Returns:
            `(attn_output, None, past_key_value)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # When an output gate is enabled, `q_proj` emits the gate logits alongside the queries, grouped per
        # key/value head; split them apart before shaping the queries into heads.
        if self.headwise_attn_output_gate:
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states, [self.head_dim * self.num_key_value_groups, self.num_key_value_groups], dim=-1
            )
            # one gate logit per head, broadcast over head_dim later
            gate_score = gate_score.reshape(bsz, q_len, -1, 1)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        elif self.elementwise_attn_output_gate:
            query_states = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(
                query_states,
                [self.head_dim * self.num_key_value_groups, self.head_dim * self.num_key_value_groups],
                dim=-1,
            )
            # one gate logit per output element
            gate_score = gate_score.reshape(bsz, q_len, -1, self.head_dim)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        else:
            query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Unlike the eager/SDPA paths, key/value heads are intentionally not repeated with `repeat_kv` here.
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the expected shape for Flash Attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window
        else:
            sliding_window = None
        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=sliding_window,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        if self.headwise_attn_output_gate or self.elementwise_attn_output_gate:
            attn_output = attn_output * torch.sigmoid(gate_score)

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        # This path never computes attention probabilities, so there is nothing to return for them whatever
        # `output_attentions` says. Assign unconditionally: the name used to be bound only when
        # `output_attentions` was False, which made `output_attentions=True` fail with an unbound local.
        attn_weights = None

        return attn_output, attn_weights, past_key_value
492
+
493
+
494
+
495
class Qwen3SdpaAttention(Qwen3Attention):
    """
    Qwen3 attention built on `torch.nn.functional.scaled_dot_product_attention`. It subclasses
    `Qwen3Attention` and reuses its weights unchanged; only the forward pass differs, to fit the SDPA API.
    """

    # Adapted from Qwen3Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Compute self-attention with SDPA and return `(attn_output, None, past_key_value)`.

        When `output_attentions` is set, the call is delegated to the parent class's forward, since this
        path does not return attention weights.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "Qwen3Model is using Qwen3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        if self.headwise_attn_output_gate or self.elementwise_attn_output_gate:
            # The query projection also carries the gate logits, laid out per key/value head.
            # Head-wise gating takes precedence when both flags are set.
            query_width = self.head_dim * self.num_key_value_groups
            if self.headwise_attn_output_gate:
                gate_width = self.num_key_value_groups  # one logit per query head
                gate_last_dim = 1
            else:
                gate_width = self.head_dim * self.num_key_value_groups  # one logit per output element
                gate_last_dim = self.head_dim
            grouped = query_states.view(bsz, q_len, self.num_key_value_heads, -1)
            query_states, gate_score = torch.split(grouped, [query_width, gate_width], dim=-1)
            gate_score = gate_score.reshape(bsz, q_len, -1, gate_last_dim)
            query_states = query_states.reshape(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        else:
            query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if self.use_qk_norm:
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin/cos are specific to RoPE models
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand key/value heads so they line up with the query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Whatever length the mask has, trim its last dimension to the key length.
        if attention_mask is None:
            causal_mask = None
        else:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # `is_causal` is decided by an explicit statement rather than inline in the SDPA call so that
        # torch.compile's dynamic shapes and full graph options keep working.
        # q_len > 1 mirrors AttentionMaskConverter.to_causal_4d, which builds no causal mask when q_len == 1.
        if causal_mask is None and q_len > 1:
            is_causal = True
        else:
            is_causal = False

        dropout_p = self.attention_dropout if self.training else 0.0
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=dropout_p,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()

        if self.headwise_attn_output_gate or self.elementwise_attn_output_gate:
            attn_output = attn_output * torch.sigmoid(gate_score)

        attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
602
+
603
# Maps `config._attn_implementation` to the attention module class built by each decoder layer.
QWEN3_ATTENTION_CLASSES = {
    "eager": Qwen3Attention,
    "flash_attention_2": Qwen3FlashAttention2,
    "sdpa": Qwen3SdpaAttention,
}
608
+
609
+
610
class Qwen3DecoderLayer(nn.Module):
    """One decoder block: RMSNorm -> self-attention -> residual add, then RMSNorm -> MLP -> residual add."""

    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        attn_impl = config._attn_implementation
        if config.sliding_window and attn_impl != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{attn_impl}`; "
                "unexpected results may be encountered."
            )
        attention_cls = QWEN3_ATTENTION_CLASSES[attn_impl]
        self.self_attn = attention_cls(config, layer_idx)

        self.mlp = Qwen3MLP(config)
        self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask, handed unchanged to the
                attention module.
            position_ids (`torch.LongTensor`, *optional*): handed unchanged to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                If truthy, the attention weights returned by the attention module are appended to the output.
            use_cache (`bool`, *optional*):
                If truthy, the key/value cache returned by the attention module is appended to the output
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs that are ignored; present for FSDP and other methods that inject code
                into the model

        Returns:
            A tuple starting with the new hidden states, followed by the attention weights (only if
            `output_attentions`) and the key/value cache (only if `use_cache`), in that order.
        """
        # --- self-attention sub-block (pre-norm + residual) ---
        attn_input = self.input_layernorm(hidden_states)
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
        )
        hidden_states = hidden_states + attn_output

        # --- feed-forward sub-block (pre-norm + residual) ---
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_output

        layer_outputs = [hidden_states]
        if output_attentions:
            layer_outputs.append(self_attn_weights)
        if use_cache:
            layer_outputs.append(present_key_value)

        return tuple(layer_outputs)
692
+
693
+
694
+ QWEN3_START_DOCSTRING = r"""
695
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
696
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
697
+ etc.)
698
+
699
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
700
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
701
+ and behavior.
702
+
703
+ Parameters:
704
+ config ([`Qwen3Config`]):
705
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
706
+ load the weights associated with the model, only the configuration. Check out the
707
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
708
+ """
709
+
710
+
711
@add_start_docstrings(
    "The bare Qwen3 Model outputting raw hidden-states without any specific head on top.",
    QWEN3_START_DOCSTRING,
)
class Qwen3PreTrainedModel(PreTrainedModel):
    # Class-level switches read by the `PreTrainedModel` machinery.
    config_class = Qwen3Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        """Initialize `nn.Linear` / `nn.Embedding` weights from N(0, initializer_range); other modules are left alone."""
        init_std = self.config.initializer_range
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        module.weight.data.normal_(mean=0.0, std=init_std)
        if isinstance(module, nn.Linear):
            # Linear layers additionally get a zero bias when they have one.
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            # Embeddings keep the padding row at zero.
            module.weight.data[module.padding_idx].zero_()
737
+
738
+
739
+ QWEN3_INPUTS_DOCSTRING = r"""
740
+ Args:
741
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
742
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
743
+ it.
744
+
745
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
746
+ [`PreTrainedTokenizer.__call__`] for details.
747
+
748
+ [What are input IDs?](../glossary#input-ids)
749
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
750
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
751
+
752
+ - 1 for tokens that are **not masked**,
753
+ - 0 for tokens that are **masked**.
754
+
755
+ [What are attention masks?](../glossary#attention-mask)
756
+
757
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
758
+ [`PreTrainedTokenizer.__call__`] for details.
759
+
760
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
761
+ `past_key_values`).
762
+
763
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
764
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
765
+ information on the default strategy.
766
+
767
+ - 1 indicates the head is **not masked**,
768
+ - 0 indicates the head is **masked**.
769
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
770
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
771
+ config.n_positions - 1]`.
772
+
773
+ [What are position IDs?](../glossary#position-ids)
774
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
775
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
776
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
777
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
778
+
779
+ Two formats are allowed:
780
+ - a [`~cache_utils.Cache`] instance, see our
781
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
782
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
783
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
784
+ cache format.
785
+
786
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
787
+ legacy cache format will be returned.
788
+
789
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
790
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
791
+ of shape `(batch_size, sequence_length)`.
792
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
793
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
794
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
795
+ model's internal embedding lookup matrix.
796
+ use_cache (`bool`, *optional*):
797
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
798
+ `past_key_values`).
799
+ output_attentions (`bool`, *optional*):
800
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
801
+ tensors for more detail.
802
+ output_hidden_states (`bool`, *optional*):
803
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
804
+ more detail.
805
+ return_dict (`bool`, *optional*):
806
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
807
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
808
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
809
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
810
+ the complete sequence length.
811
+ """
812
+
813
+
814
+ @add_start_docstrings(
815
+ "The bare Qwen3 Model outputting raw hidden-states without any specific head on top.",
816
+ QWEN3_START_DOCSTRING,
817
+ )
818
+ class Qwen3Model(Qwen3PreTrainedModel):
819
+ """
820
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen3DecoderLayer`]
821
+
822
+ Args:
823
+ config: Qwen3Config
824
+ """
825
+
826
def __init__(self, config: Qwen3Config):
    """Build the token embedding, the stack of decoder layers, the final norm and the rotary embedding."""
    super().__init__(config)
    self.padding_idx = config.pad_token_id
    self.vocab_size = config.vocab_size

    self.embed_tokens = nn.Embedding(
        num_embeddings=config.vocab_size,
        embedding_dim=config.hidden_size,
        padding_idx=self.padding_idx,
    )
    decoder_layers = []
    for layer_idx in range(config.num_hidden_layers):
        decoder_layers.append(Qwen3DecoderLayer(config, layer_idx))
    self.layers = nn.ModuleList(decoder_layers)
    self._attn_implementation = config._attn_implementation
    self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    self.rotary_emb = Qwen3RotaryEmbedding(config=config)

    self.gradient_checkpointing = False
    # Initialize weights and apply final processing
    self.post_init()
842
+
843
def get_input_embeddings(self):
    """Return the module stored in `self.embed_tokens`."""
    embedding_module = self.embed_tokens
    return embedding_module
845
+
846
def set_input_embeddings(self, value):
    """Replace `self.embed_tokens` with `value`."""
    setattr(self, "embed_tokens", value)
848
+
849
@add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    # Flags left as None fall back to the values stored on the config.
    cfg = self.config
    if output_attentions is None:
        output_attentions = cfg.output_attentions
    if output_hidden_states is None:
        output_hidden_states = cfg.output_hidden_states
    if use_cache is None:
        use_cache = cfg.use_cache
    if return_dict is None:
        return_dict = cfg.use_return_dict

    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

    checkpointing = self.gradient_checkpointing and self.training
    if checkpointing and use_cache:
        logger.warning_once(
            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
        )
        use_cache = False

    # kept for BC (non `Cache` `past_key_values` inputs)
    return_legacy_cache = False
    if use_cache and not isinstance(past_key_values, Cache):
        return_legacy_cache = True
        if past_key_values is not None:
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
            )
        else:
            past_key_values = DynamicCache()

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    if cache_position is None:
        # Continue counting positions from what the cache already holds.
        if past_key_values is not None:
            past_seen_tokens = past_key_values.get_seq_length()
        else:
            past_seen_tokens = 0
        cache_position = torch.arange(
            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
        )
    if position_ids is None:
        position_ids = cache_position.unsqueeze(0)

    causal_mask = self._update_causal_mask(
        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
    )

    hidden_states = inputs_embeds

    # the position embeddings are computed once and shared by every decoder layer
    position_embeddings = self.rotary_emb(hidden_states, position_ids)

    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = None
    # A layer returns (hidden_states, [attn_weights], [cache]); the cache slot moves when weights are present.
    cache_slot = 2 if output_attentions else 1

    for decoder_layer in self.layers:
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if checkpointing:
            layer_outputs = self._gradient_checkpointing_func(
                decoder_layer.__call__,
                hidden_states,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                position_embeddings,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            next_decoder_cache = layer_outputs[cache_slot]

        if output_attentions:
            all_self_attns = all_self_attns + (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # the normalized output of the last layer closes the hidden-state list
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    if return_legacy_cache:
        next_cache = next_cache.to_legacy_cache()

    if return_dict:
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
    # Tuple output drops every entry that is None.
    candidates = [hidden_states, next_cache, all_hidden_states, all_self_attns]
    return tuple(v for v in candidates if v is not None)
974
+
975
+ # Copied from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
976
def _update_causal_mask(
    self,
    attention_mask: torch.Tensor,
    input_tensor: torch.Tensor,
    cache_position: torch.Tensor,
    past_key_values: Cache,
    output_attentions: bool,
):
    """
    Produce the mask handed to the decoder layers for the configured attention implementation.

    Returns `attention_mask` itself or `None` for flash attention, `None` when the SDPA path can rely on
    `is_causal` alone, and otherwise the 4D mask built by
    `_prepare_4d_causal_attention_mask_with_cache_position`.
    """
    attn_impl = self.config._attn_implementation

    if attn_impl == "flash_attention_2":
        # The mask is only forwarded when it actually contains a zero entry.
        if attention_mask is not None and 0.0 in attention_mask:
            return attention_mask
        return None

    # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
    # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
    # to infer the attention mask.
    if past_key_values is not None:
        past_seen_tokens = past_key_values.get_seq_length()
    else:
        past_seen_tokens = 0
    using_static_cache = isinstance(past_key_values, StaticCache)
    using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
    fixed_size_cache = using_static_cache or using_sliding_window_cache

    # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
    if attn_impl == "sdpa" and not fixed_size_cache and not output_attentions:
        skip_mask = AttentionMaskConverter._ignore_causal_mask_sdpa(
            attention_mask,
            inputs_embeds=input_tensor,
            past_key_values_length=past_seen_tokens,
            sliding_window=self.config.sliding_window,
            is_training=self.training,
        )
        if skip_mask:
            return None

    dtype, device = input_tensor.dtype, input_tensor.device
    min_dtype = torch.finfo(dtype).min
    sequence_length = input_tensor.shape[1]

    if fixed_size_cache:
        # SlidingWindowCache or StaticCache
        target_length = past_key_values.get_max_cache_shape()
    elif isinstance(attention_mask, torch.Tensor):
        # DynamicCache or no cache, with a mask supplied
        target_length = attention_mask.shape[-1]
    else:
        # DynamicCache or no cache, without a mask
        target_length = past_seen_tokens + sequence_length + 1

    # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
    causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask,
        sequence_length=sequence_length,
        target_length=target_length,
        dtype=dtype,
        device=device,
        cache_position=cache_position,
        batch_size=input_tensor.shape[0],
        config=self.config,
        past_key_values=past_key_values,
    )

    needs_unmask = (
        attn_impl == "sdpa"
        and attention_mask is not None
        and attention_mask.device.type == "cuda"
        and not output_attentions
    )
    if needs_unmask:
        # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        # Details: https://github.com/pytorch/pytorch/issues/110213
        causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

    return causal_mask
1050
+
1051
@staticmethod
# Adapted from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Qwen3
def _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask: torch.Tensor,
    sequence_length: int,
    target_length: int,
    dtype: torch.dtype,
    device: torch.device,
    cache_position: torch.Tensor,
    batch_size: int,
    config: Qwen3Config,
    past_key_values: Cache,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

    Args:
        attention_mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
            `(batch_size, 1, query_length, key_value_length)`. May be `None`, in which case only the causal
            (and, if configured, sliding-window) structure is applied.
        sequence_length (`int`):
            The sequence length being processed.
        target_length (`int`):
            The target length: when generating with static cache, the mask should be as long as the static cache,
            to account for the 0 padding, the part of the cache that is not filled yet.
        dtype (`torch.dtype`):
            The dtype to use for the 4D attention mask.
        device (`torch.device`):
            The device to place the 4D attention mask on.
        cache_position (`torch.Tensor`):
            Indices depicting the position of the input sequence tokens in the sequence.
        batch_size (`int`):
            Batch size.
        config (`Qwen3Config`):
            The model's configuration class; only `config.sliding_window` is read here.
        past_key_values (`Cache`):
            The cache class that is being used currently to generate.

    Returns:
        `torch.Tensor`: a 4D additive mask holding `0` where attention is allowed and `torch.finfo(dtype).min`
        where it is blocked. A 4D `attention_mask` is returned unchanged.
    """
    if attention_mask is not None and attention_mask.dim() == 4:
        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
        return attention_mask

    min_dtype = torch.finfo(dtype).min
    causal_mask = torch.full(
        (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
    )
    # True marks key positions that lie strictly after the query's cache position (i.e. the future).
    diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
    if config.sliding_window is not None:
        # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
        # the check is needed to verify whether the current checkpoint was trained with sliding window or not
        if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
            sliding_attend_mask = torch.arange(target_length, device=device) <= (
                cache_position.reshape(-1, 1) - config.sliding_window
            )
            diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
    # Multiplying by the boolean mask zeroes every allowed position and keeps `min_dtype` on blocked ones.
    causal_mask *= diagonal_attend_mask
    causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
    if attention_mask is not None:
        causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
        if attention_mask.shape[-1] > target_length:
            attention_mask = attention_mask[:, :target_length]
        mask_length = attention_mask.shape[-1]
        # Move the 2D mask onto the causal mask's device first: adding tensors that live on
        # different devices raises a RuntimeError.
        padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
            causal_mask.device
        )
        # A sum of exactly 0 means "causally allowed (0) but padded (0)": those positions must be blocked too.
        padding_mask = padding_mask == 0
        causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
            padding_mask, min_dtype
        )
    return causal_mask
1118
+
1119
+
1120
class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin):
    """Qwen3 decoder (`Qwen3Model`) followed by a bias-free linear language-modeling head."""

    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.model = Qwen3Model(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    # --- embedding / decoder accessors -------------------------------------------------

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_decoder(self):
        return self.model

    def set_decoder(self, decoder):
        self.model = decoder

    # NOTE: the docstring below is consumed by `replace_return_docstrings`, which needs the bare
    # `Returns:` line to be present, so its text is kept as-is.
    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        num_logits_to_keep: int = 0,
        **loss_kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            num_logits_to_keep (`int`, *optional*):
                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        # Fall back to the config for every output switch the caller left unset.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        decoder_outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        last_hidden_state = decoder_outputs[0]
        # Only compute necessary logits; a value of 0 yields the slice `[-0:]`, i.e. every position.
        kept_states = last_hidden_state[:, -num_logits_to_keep:, :]
        logits = self.lm_head(kept_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)

        if return_dict:
            return CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=decoder_outputs.past_key_values,
                hidden_states=decoder_outputs.hidden_states,
                attentions=decoder_outputs.attentions,
            )

        # Tuple output: logits replace the hidden states, with the loss prepended when available.
        output = (logits,) + decoder_outputs[1:]
        return output if loss is None else (loss,) + output
1239
+
1240
+
1241
@add_start_docstrings(
    """
    The Qwen3 Model transformer with a sequence classification head on top (linear layer).

    [`Qwen3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    QWEN3_START_DOCSTRING,
)
class Qwen3ForSequenceClassification(Qwen3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen3Model(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Score every position first; a single position per row is selected below.
        logits = self.score(transformer_outputs[0])

        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]

        pad_token_id = self.config.pad_token_id
        if pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if pad_token_id is not None and input_ids is not None:
            # Index of the token just before the first pad token in each row.
            # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
            first_pad = torch.eq(input_ids, pad_token_id).int().argmax(-1)
            sequence_lengths = ((first_pad - 1) % input_ids.shape[-1]).to(logits.device)
        else:
            # Without a pad token (or without `input_ids`) simply take the last position.
            sequence_lengths = -1

        row_index = torch.arange(batch_size, device=logits.device)
        pooled_logits = logits[row_index, sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                # Infer the problem type once from the head size and label dtype, then store it on the config.
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(pooled_logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(pooled_logits, labels)

        if return_dict:
            return SequenceClassifierOutputWithPast(
                loss=loss,
                logits=pooled_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )

        output = (pooled_logits,) + transformer_outputs[1:]
        return output if loss is None else ((loss,) + output)
1362
+
1363
+
1364
@add_start_docstrings(
    """
    The Qwen3 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
    output) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    QWEN3_START_DOCSTRING,
)
# Adapted from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen3, LLAMA->QWEN3
class Qwen3ForTokenClassification(Qwen3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Qwen3Model(config)
        # Dropout rate precedence: `classifier_dropout`, then `hidden_dropout`, then a 0.1 default.
        if getattr(config, "classifier_dropout", None) is not None:
            classifier_dropout = config.classifier_dropout
        elif getattr(config, "hidden_dropout", None) is not None:
            classifier_dropout = config.hidden_dropout
        else:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ...,
            config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Per-token hidden states -> dropout -> one score vector per token.
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        if not return_dict:
            # NOTE(review): index 1 of the tuple is dropped here — presumably the cache entry; confirm
            # against the tuple layout `Qwen3Model` returns when `use_cache` is disabled.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
1451
+
1452
+
1453
@add_start_docstrings(
    """
    The Qwen3 Model transformer with a span classification head on top for extractive question-answering tasks like
    SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    QWEN3_START_DOCSTRING,
)
# Adapted from transformers.models.mistral.modeling_mistral.MistralForQuestionAnswering with Mistral->Qwen3, MISTRAL->QWEN3
class Qwen3ForQuestionAnswering(Qwen3PreTrainedModel):
    base_model_prefix = "model"

    # Adapted from models.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Qwen3
    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen3Model(config)
        # Two outputs per token: a span-start logit and a span-end logit.
        self.qa_outputs = nn.Linear(config.hidden_size, 2)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(QWEN3_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        span_logits = self.qa_outputs(outputs[0])
        # Split the trailing dimension of size 2 into separate start / end logit tensors.
        start_part, end_part = span_logits.split(1, dim=-1)
        start_logits = start_part.squeeze(-1).contiguous()
        end_logits = end_part.squeeze(-1).contiguous()

        # The loss is only computed when both span boundaries are supplied.
        have_span_labels = start_positions is not None and end_positions is not None
        loss = (
            self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
            if have_span_labels
            else None
        )

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        output = (start_logits, end_logits) + outputs[2:]
        return output if loss is None else ((loss,) + output)
1539
+
1B_gate_headwise/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2fb9c53e4d92b8e83dcc82bd243f62c584f691c7afcd857572b335723619f8a
3
+ size 3443609378
1B_gate_headwise/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
1B_gate_headwise/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "<tool_call>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "151658": {
125
+ "content": "</tool_call>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "151659": {
133
+ "content": "<|fim_prefix|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "151660": {
141
+ "content": "<|fim_middle|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "151661": {
149
+ "content": "<|fim_suffix|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "151662": {
157
+ "content": "<|fim_pad|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "151663": {
165
+ "content": "<|repo_name|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "151664": {
173
+ "content": "<|file_sep|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ }
180
+ },
181
+ "additional_special_tokens": [
182
+ "<|im_start|>",
183
+ "<|im_end|>",
184
+ "<|object_ref_start|>",
185
+ "<|object_ref_end|>",
186
+ "<|box_start|>",
187
+ "<|box_end|>",
188
+ "<|quad_start|>",
189
+ "<|quad_end|>",
190
+ "<|vision_start|>",
191
+ "<|vision_end|>",
192
+ "<|vision_pad|>",
193
+ "<|image_pad|>",
194
+ "<|video_pad|>"
195
+ ],
196
+ "bos_token": null,
197
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content 
}}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "model_max_length": 131072,
202
+ "pad_token": "<|endoftext|>",
203
+ "split_special_tokens": false,
204
+ "tokenizer_class": "Qwen2Tokenizer",
205
+ "unk_token": null,
206
+ "add_bos_token": false
207
+ }
1B_gate_headwise/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
README.md ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Gated Attention: Implementation and Visualization
2
+
3
+ This repository contains the implementation of **gated attention** mechanisms based on the [Qwen3](https://github.com/QwenLM/Qwen3) model architecture, along with tools for visualizing attention maps. Our modifications are based on findings from recent research that demonstrate how applying **sparse, head-specific gating after Scaled Dot-Product Attention (SDPA)** can significantly improve performance, training stability, and long-context generalization. More details are in our paper.
4
+
5
+ ## 📚 Introduction
6
+
7
+ Gating mechanisms have long been a cornerstone of neural network design, enabling dynamic control over information flow. In this work, we focus on integrating and evaluating these mechanisms within standard softmax attention layers of transformer models.
8
+
9
+ We introduce a **query-dependent sparse gate** after the SDPA output (`G1`), which modulates each attention head independently using a sigmoid function. This simple yet effective change:
10
+
11
+ - Introduces **non-linearity** into the low-rank transformation formed by value and output projections.
12
+ - Enables **input-dependent sparsity**, preventing the "attention sink" phenomenon where early tokens dominate attention distributions.
13
+ - Improves **training stability**, allowing larger learning rates.
14
+ - Enhances **long-context extrapolation**, showing significant gains on benchmarks like RULER.
15
+
16
+ ---
17
+
18
+ ## 📦 Models
19
+
20
+ We provide variants of the Qwen3 model with different gating configurations:
21
+
22
+ - `baseline`: Standard attention without any gating.
23
+ - `gate_headwise`: Headwise gating applied after SDPA.
24
+ - `gate_elementwise`: Elementwise gating applied after SDPA.
25
+
26
+ All models are compatible with HuggingFace Transformers APIs.
27
+
28
+ ---
29
+
30
+
31
+ ## 🧪 Demo Usage
32
+
33
+ A demo script is included to load a trained model and visualize attention maps with gating enabled.
34
+
35
+ ### Requirements
36
+
37
+ ```bash
38
+ pip install transformers matplotlib numpy torch
39
+ ```
40
+
41
+ ### Run Demo
42
+
43
+ ```bash
44
+ python demo.py
45
+ ```
46
+
47
+ This will produce a file named `{model_name}_selected_layer_attention_maps.png`, showing attention maps for four key layers.
48
+
49
+ #### Attention Maps Comparison
50
+
51
+ Below are the attention maps from **Layer 1**, **Layer 7**, **Layer 21**, and **Layer 28** of three different model variants: `baseline`, `gate_headwise`, and `gate_elementwise`. These visualizations help illustrate how gating mechanisms affect attention patterns, especially in relation to the "attention sink" phenomenon.
52
+
53
+ In the **baseline** model, we observe a strong "attention sink" effect — the **first token** consistently receives disproportionately high attention scores across multiple layers. This indicates that the model overly relies on the initial token, potentially limiting its ability to distribute attention meaningfully across other positions.
54
+
55
+ ##### Baseline Model
56
+
57
+ ![baseline](baseline_selected_layer_attention_maps.png)
58
+
59
+ > **Observation**: Strong diagonal dominance with significant focus on the first token (attention sink). This pattern persists across multiple layers.
60
+
61
+ ##### Gate Headwise Model
62
+
63
+ ![headwise](gate_headwise_selected_layer_attention_maps.png)
64
+
65
+ > **Observation**: Gating applied headwise reduces the attention sink effect. Attention becomes more distributed and context-dependent.
66
+
67
+ ##### Gate Elementwise Model
68
+
69
+ ![elementwise](gate_elementwise_selected_layer_attention_maps.png)
70
+
71
+ > **Observation**: Elementwise gating further enhances sparsity and selectivity in attention patterns, leading to cleaner and more structured attention maps.
72
+
73
+ ---
74
+
75
+ ## 📁 Repository Structure
76
+
77
+ ```sh
78
+ qwen3-gated/
79
+ ├── 1B_baseline # Baseline with the same architecture as the Qwen3 model
80
+ ├── 1B_gate_elementwise # Model augmented with elementwise SDPA output gating
81
+ ├── 1B_gate_headwise # Model augmented with headwise SDPA output gating
82
+ ├── demo.py # Simple demo for loading a model and extracting attention maps
83
+ └── README.md # You are here
84
+ ```
85
+
86
+ ---
87
+
88
+ ## 🔧 Implementation Details
89
+
90
+ The core changes to implement gated attention are found in the `Qwen3Attention` class in the provided code.
91
+
92
+ ### 🧠 Gating Variants
93
+
94
+ We support two main types of gating:
95
+
96
+ #### 1. **Headwise Gating**
97
+
98
+ Each attention head has its own gate scalar.
99
+
100
+ ```python
101
+ self.headwise_attn_output_gate = True
102
+ ```
103
+
104
+ These options can be configured in the model config under:
105
+
106
+ ```json
107
+ {
108
+ "headwise_attn_output_gate": true,
109
+ "elementwise_attn_output_gate": false
110
+ }
111
+ ```
112
+
113
+ #### 2. **Elementwise Gating**
114
+
115
+ Each element of the attention output is modulated independently.
116
+
117
+ ```python
118
+ self.elementwise_attn_output_gate = True
119
+ ```
120
+
121
+ These options can be configured in the model config under:
122
+
123
+ ```json
124
+ {
125
+ "headwise_attn_output_gate": false,
126
+ "elementwise_attn_output_gate": true
127
+ }
128
+ ```
129
+
130
+ ---
131
+
132
+
133
+ <!-- ## 📝 Citation
134
+
135
+ If you use this code or models in your research, please cite our paper:
136
+
137
+ ```bibtex
138
+ @inproceedings{lin2025forgetting,
139
+ title={Sparse Gating Mitigates Attention Sink and Enhances Long-Context Modeling},
140
+ author={Lin, Yaxin and Xiao, Tianhe and Csordás, Róbert and Gu, Aohan and Zhang, Peng and Yang, Jian and Hua, Xinyi and Sun, Yu and Lin, Zhongkai and Liu, Hang and Tang, Yiyuan and Zhou, Minghai and Wang, Hongxia and Li, Junnan and Yuan, Zilong and Zhang, Yiren and Darcet, Théo and Chen, Bo and Cao, Qingqing and Lu, Haibo and Wang, Yizheng and Zhang, Kai and Zhang, Han and Wang, Shuming and Wu, Weiwei and Huang, Weikang and He, Yanyan and Xu, Jialin and Li, Xiaoyu and Zhao, Yue and Chen, Zhiyuan and Zhang, Yujie and Zhang, Xuefeng and Chen, Jing and Wang, Yanan and Wang, Jiaqi and Liu, Yuxiao and Liu, Chao and Liu, Yifan and Li, Meng and Zhu, Yunpeng and Jiang, Yufei and Li, Cheng and Zhang, Yuhao and Wang, Shuo and Zhang, Peng and Ma, Yuting and Wang, Xiaolong and Li, Yaxin and Zhang, Yaxin and Zhang, Peng and Zhang, Yiren and Zhang, Yujie and Zhang, Xuefeng and Zhang, Kai and Zhang, Han and Zhang, Yiren and Zhang, Yujie and Zhang, Xuefeng and Zhang, Kai and Zhang, Han},
141
+ year={2025}
142
+ }
143
+ ``` -->
144
+
145
+ ## 📬 Contact
146
+
147
+ For questions or collaboration opportunities, feel free to reach out at <[email protected]>.
baseline_selected_layer_attention_maps.png ADDED
demoe.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+
6
# Directory that contains the `1B_<variant>` checkpoint folders.
base_dir = './'
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 0-based indices of the layers to plot: the 1st, 7th, 21st and 28th layers.
layers_to_visualize = [0, 6, 20, 27]


def average_heads(attentions):
    """Average the attention scores over all heads, layer by layer.

    Args:
        attentions: Iterable of per-layer attention tensors, each laid out as
            (batch, head, seq_len, seq_len).

    Returns:
        A list with one (seq_len, seq_len) NumPy array per layer, taken from
        the first sample of the batch.
    """
    averaged = []
    for layer_attn in attentions:
        # Upcast before leaving torch: `Tensor.numpy()` cannot convert
        # bfloat16 tensors, and `.float()` is a no-op for float32 ones.
        avg_attn = layer_attn.mean(dim=1).float().cpu().numpy()  # (batch, seq_len, seq_len)
        averaged.append(avg_attn[0])  # Keep only the first sample
    return averaged


# Produce one 2x2 figure of attention maps per model variant.
for name in ['baseline', 'gate_elementwise', 'gate_headwise']:

    # Load model and tokenizer
    model_name_or_path = f"{base_dir}/1B_{name}"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(device)

    # Input text
    prompt = "Sparse gating mechanism mitigates attention sink."
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Forward pass with output_attentions=True to retrieve attention scores;
    # gradients are not needed for visualization.
    with torch.no_grad():
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            output_attentions=True,
        )

    # Tuple indexed by layer; each entry is (batch, head, seq_len, seq_len).
    attentions = outputs.attentions
    averaged_attentions = average_heads(attentions)

    # Tokens of the prompt, used as axis labels.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    tick_positions = np.arange(len(tokens))

    # One subplot per selected layer.
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    axes = axes.flatten()

    for ax, layer_idx in zip(axes, layers_to_visualize):
        attn_map = averaged_attentions[layer_idx]

        # Plot the attention map with its own colorbar.
        im = ax.imshow(attn_map, cmap="viridis")
        fig.colorbar(im, ax=ax)

        # Titles use 1-based layer numbers.
        ax.set_title(f"Layer {layer_idx + 1}")

        # Label both axes with the prompt tokens.
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(tokens, rotation=90)
        ax.set_yticklabels(tokens)

        # Hide tick marks
        ax.tick_params(axis='both', which='both', length=0)

    plt.tight_layout()
    plt.savefig(f"{name}_selected_layer_attention_maps.png")
    plt.show()

    # Close the figure so figures do not pile up across variants, and drop
    # the references to this variant's model and activations so they can be
    # reclaimed before the next checkpoint is loaded.
    plt.close(fig)
    del model, outputs, attentions
gate_elementwise_selected_layer_attention_maps.png ADDED
gate_headwise_selected_layer_attention_maps.png ADDED