Upload folder using huggingface_hub
Browse files- config.json +3 -2
- generation_config.json +2 -10
- model-00001-of-00004.safetensors +2 -2
- model-00002-of-00004.safetensors +2 -2
- model-00003-of-00004.safetensors +2 -2
- model-00004-of-00004.safetensors +2 -2
- model.safetensors.index.json +0 -0
- qwen_model_v2.py +253 -0
config.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "
|
3 |
"architectures": [
|
4 |
-
"
|
5 |
],
|
6 |
"attention_dropout": 0.0,
|
7 |
"audio_encoder_config": {
|
@@ -107,6 +107,7 @@
|
|
107 |
},
|
108 |
"audio_token_index": 151665,
|
109 |
"auto_map": {
|
|
|
110 |
"AutoModelForCausalLM": "qwen_model.Model"
|
111 |
},
|
112 |
"bos_token_id": 151643,
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "./7b-audio-instruct",
|
3 |
"architectures": [
|
4 |
+
"LLMAudioForConditionalGeneration"
|
5 |
],
|
6 |
"attention_dropout": 0.0,
|
7 |
"audio_encoder_config": {
|
|
|
107 |
},
|
108 |
"audio_token_index": 151665,
|
109 |
"auto_map": {
|
110 |
+
"AutoModel": "qwen_model_v2.LLMAudioForConditionalGeneration",
|
111 |
"AutoModelForCausalLM": "qwen_model.Model"
|
112 |
},
|
113 |
"bos_token_id": 151643,
|
generation_config.json
CHANGED
@@ -1,14 +1,6 @@
|
|
1 |
{
|
|
|
2 |
"bos_token_id": 151643,
|
3 |
-
"
|
4 |
-
"eos_token_id": [
|
5 |
-
151645,
|
6 |
-
151643
|
7 |
-
],
|
8 |
-
"pad_token_id": 151643,
|
9 |
-
"repetition_penalty": 1.05,
|
10 |
-
"temperature": 0.7,
|
11 |
-
"top_k": 20,
|
12 |
-
"top_p": 0.8,
|
13 |
"transformers_version": "4.46.0"
|
14 |
}
|
|
|
1 |
{
|
2 |
+
"_from_model_config": true,
|
3 |
"bos_token_id": 151643,
|
4 |
+
"eos_token_id": 151645,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
"transformers_version": "4.46.0"
|
6 |
}
|
model-00001-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:74cad97da6d98ef47f294c6371b0b2a0f2b674de096cb9740b8f9b9ce4e6325a
|
3 |
+
size 4895442400
|
model-00002-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e143cdf06f0968c76fabfc97ae318de5546ce1fa7b5d26d4fe23c7d239edec1a
|
3 |
+
size 4991497784
|
model-00003-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f61bd34b0bd61a328e6a80c686cb9f17640c7d861efd92c26908e3f738f35829
|
3 |
+
size 4932752872
|
model-00004-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dfa4d74bf95c2afbc8361a8bf5288ca3637eb5c71a81e8bf7ae8d29a18f0bf77
|
3 |
+
size 1689086112
|
model.safetensors.index.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
qwen_model_v2.py
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Qwen2ForCausalLM, AutoModel, Qwen2Config
|
2 |
+
from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer
|
3 |
+
from transformers import WhisperPreTrainedModel, WhisperConfig
|
4 |
+
from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
|
5 |
+
from transformers.modeling_utils import PreTrainedModel
|
6 |
+
from transformers.generation import GenerationMixin
|
7 |
+
from transformers.models.auto import AutoModel, AutoModelForCausalLM
|
8 |
+
from torch import nn
|
9 |
+
import torch
|
10 |
+
import math
|
11 |
+
import logging
|
12 |
+
|
13 |
+
class WhisperEncoder(WhisperPreTrainedModel):
    """Audio encoder built from a two-layer Conv1d stem and a stack of `WhisperEncoderLayer`s.

    The stem maps mel features to `config.d_model` channels; the second
    convolution has stride 2. Position embeddings are a frozen `nn.Embedding`
    table indexed by a registered `arange` buffer, and a final `LayerNorm` is
    applied after the last layer.
    """

    def __init__(self, config: WhisperConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)

        # Position indices live in a buffer so they follow the module across devices.
        self.register_buffer('range_max_source_positions', torch.arange(self.max_source_positions))

        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
        self.embed_positions.requires_grad_(False)

        self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        self.post_init()

    def _freeze_parameters(self):
        """Disable gradients for every parameter of the encoder."""
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        """Return the first convolution, which acts as the input embedding layer."""
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        """Replace the first convolution with `value`."""
        self.conv1 = value

    def forward(
        self,
        input_features,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode mel features into hidden states.

        Args:
            input_features: Mel features whose last dimension must equal
                `max_source_positions * conv1.stride[0] * conv2.stride[0]`.
            attention_mask: Accepted for interface compatibility; it is not
                forwarded to the encoder layers (they are called with `None`).
            head_mask: Optional per-layer head mask; its first dimension must
                equal the number of encoder layers.
            output_attentions: Whether to collect per-layer attention outputs;
                defaults to `config.output_attentions`.
            output_hidden_states: Whether to collect per-layer hidden states;
                defaults to `config.output_hidden_states`.
            return_dict: Whether to return a `BaseModelOutput` instead of a
                tuple; defaults to `config.use_return_dict`.

        Returns:
            A `BaseModelOutput`, or a tuple of its non-`None` fields when
            `return_dict` is false.

        Raises:
            ValueError: If the feature length is wrong, or if `head_mask` does
                not cover every layer.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        # Conv1d emits (batch, channels, time); the layers consume (batch, time, channels).
        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions(self.range_max_source_positions)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # Validate with an explicit exception: `assert` is stripped under `python -O`,
        # and the length check above already reports bad input via ValueError.
        if head_mask is not None and head_mask.size()[0] != len(self.layers):
            raise ValueError(
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
            )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:  # skip the layer
                    to_drop = True

            if to_drop:
                # A skipped layer leaves `hidden_states` untouched.
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        None,
                        (head_mask[idx] if head_mask is not None else None),
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        None,
                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                        output_attentions=output_attentions,
                    )

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
|
133 |
+
|
134 |
+
class LLMAudioPreTrainedModel(PreTrainedModel):
    """Base class that wires the audio-LLM into the `PreTrainedModel` machinery.

    Declares the config class and capability flags, and provides the weight
    initialisation used for linear, 1-D convolution and embedding modules.
    """

    config_class = Qwen2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialise `module` in place with a zero-mean normal distribution.

        The standard deviation is `config.initializer_range` when the config
        has that attribute, otherwise `config.audio_config.initializer_range`.
        Biases are zeroed, as is the padding row of an embedding.
        """
        cfg = self.config
        if hasattr(cfg, "initializer_range"):
            init_std = cfg.initializer_range
        else:
            init_std = cfg.audio_config.initializer_range

        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if not isinstance(module, nn.Embedding):
            # Other module types keep whatever initialisation they already have.
            return

        module.weight.data.normal_(mean=0.0, std=init_std)
        pad_row = module.padding_idx
        if pad_row is not None:
            module.weight.data[pad_row].zero_()
|
157 |
+
|
158 |
+
class LLMAudioForConditionalGeneration(LLMAudioPreTrainedModel, GenerationMixin):
    """Causal language model that consumes audio through a Whisper-style encoder.

    Audio features are encoded, linearly projected to the language model's
    hidden size, and written into the token embeddings at the positions where
    `input_ids == config.audio_token_index`.
    """

    def __init__(self, config):
        super().__init__(config)

        audio_config = WhisperConfig.from_dict(config.audio_encoder_config)
        self.encoder = WhisperEncoder(audio_config)
        self.projection = nn.Linear(self.encoder.config.d_model, self.config.hidden_size, bias=False)
        self.language_model = AutoModelForCausalLM.from_config(config)
        # Re-export the language model's tied keys under this wrapper's prefix.
        if self.language_model._tied_weights_keys is not None:
            self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]

    def get_input_embeddings(self):
        """Return the language model's input embedding module."""
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Set the language model's input embedding module."""
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        """Return the language model's output embedding module."""
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        """Set the language model's output embedding module."""
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        """Set the language model's decoder."""
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        """Return the language model's decoder."""
        return self.language_model.get_decoder()

    def forward(
        self,
        input_ids,
        attention_mask=None,
        position_ids=None,
        input_features=None,
        feature_attention_mask=None,
        past_key_values=None,
        print_input_features_shape=False,
        inputs_embeds=None,
        **kwargs,
    ):
        """Run the language model, optionally splicing in encoded audio.

        Args:
            input_ids: Token ids; required whenever `inputs_embeds` is not given.
            attention_mask: Forwarded to the language model.
            position_ids: Forwarded to the language model.
            input_features: Optional 3-D audio features; the last dimension is
                the mel sequence length.
            feature_attention_mask: Mask over the mel frames; its sum along the
                last dimension gives each audio's valid length. Required
                whenever the audio encoder is run.
            past_key_values: Forwarded to the language model.
            print_input_features_shape: Debug switch that prints the projected
                audio features before they are merged.
            inputs_embeds: Precomputed embeddings; when given, no audio is merged.
            **kwargs: Accepted and ignored.

        Returns:
            `CausalLMOutputWithPast` with `loss=None`.

        Raises:
            ValueError: If audio has to be encoded but `feature_attention_mask`
                is missing.
        """
        target_device = self.encoder.device

        if input_features is not None:
            input_features = input_features.to(target_device)
            # The mask is only mandatory when the encoder actually runs (checked below).
            if feature_attention_mask is not None:
                feature_attention_mask = feature_attention_mask.to(target_device)

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
            # A single-token step skips the encoder; audio is only merged for longer inputs.
            if input_features is not None and input_ids.shape[1] != 1:
                if feature_attention_mask is None:
                    raise ValueError(
                        "`feature_attention_mask` must be provided together with `input_features`."
                    )

                batch_size, _, max_mel_seq_len = input_features.shape
                max_seq_len = (max_mel_seq_len - 2) // 2 + 1
                audio_feat_lengths = self.encoder._get_feat_extract_output_lengths(feature_attention_mask.sum(-1))
                seq_range = (
                    torch.arange(0, max_seq_len, dtype=audio_feat_lengths.dtype, device=audio_feat_lengths.device)
                    .unsqueeze(0)
                    .expand(batch_size, max_seq_len)
                )
                lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
                padding_mask = seq_range >= lengths_expand

                # Additive mask: -inf at padded key positions.
                audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
                    batch_size, 1, max_seq_len, max_seq_len
                )
                audio_attention_mask = audio_attention_mask_.to(
                    dtype=self.encoder.conv1.weight.dtype, device=self.encoder.conv1.weight.device
                )
                audio_attention_mask[audio_attention_mask_] = float("-inf")
                audio_outputs = self.encoder(input_features, attention_mask=audio_attention_mask)
                selected_audio_feature = audio_outputs.last_hidden_state
                audio_features = self.projection(selected_audio_feature)
                num_audio_tokens = audio_feat_lengths
                num_audios, max_audio_tokens, embed_dim = audio_features.shape
                # Keep only the valid (non-padded) frames of every audio clip.
                audio_features_mask = torch.arange(
                    max_audio_tokens, device=num_audio_tokens.device
                ).expand(num_audios, max_audio_tokens) < num_audio_tokens.unsqueeze(1)
                masked_audio_features = audio_features[audio_features_mask].view(-1, embed_dim)
                if print_input_features_shape:
                    print(masked_audio_features.shape, masked_audio_features.contiguous())
                # The encoder and the token embeddings may sit on different devices or
                # use different dtypes; align both operands before the in-place scatter.
                audio_positions = (input_ids == self.config.audio_token_index).to(inputs_embeds.device)
                inputs_embeds[audio_positions] = masked_audio_features.contiguous().to(
                    device=inputs_embeds.device, dtype=inputs_embeds.dtype
                )

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
        )

        logits = outputs[0]
        return CausalLMOutputWithPast(
            loss=None,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
|