huseinzol05 committed on
Commit
307d3d6
·
verified ·
1 Parent(s): 442223d

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "_name_or_path": "./7b-speech-instruct",
3
  "architectures": [
4
- "Model"
5
  ],
6
  "attention_dropout": 0.0,
7
  "audio_encoder_config": {
@@ -107,6 +107,7 @@
107
  },
108
  "audio_token_index": 151665,
109
  "auto_map": {
 
110
  "AutoModelForCausalLM": "qwen_model.Model"
111
  },
112
  "bos_token_id": 151643,
 
1
  {
2
+ "_name_or_path": "./7b-speech-instruct-v1",
3
  "architectures": [
4
+ "LLMAudioForConditionalGeneration"
5
  ],
6
  "attention_dropout": 0.0,
7
  "audio_encoder_config": {
 
107
  },
108
  "audio_token_index": 151665,
109
  "auto_map": {
110
+ "AutoModel": "qwen_model_v2.LLMAudioForConditionalGeneration",
111
  "AutoModelForCausalLM": "qwen_model.Model"
112
  },
113
  "bos_token_id": 151643,
generation_config.json CHANGED
@@ -1,14 +1,6 @@
1
  {
 
2
  "bos_token_id": 151643,
3
- "do_sample": true,
4
- "eos_token_id": [
5
- 151645,
6
- 151643
7
- ],
8
- "pad_token_id": 151643,
9
- "repetition_penalty": 1.05,
10
- "temperature": 0.7,
11
- "top_k": 20,
12
- "top_p": 0.8,
13
  "transformers_version": "4.46.0"
14
  }
 
1
  {
2
+ "_from_model_config": true,
3
  "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
 
 
 
 
 
 
 
 
 
5
  "transformers_version": "4.46.0"
6
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8564759f422ecee4fb6c94f0d428eb320d6bd2f3469c783885a083361c3988f8
3
- size 4874822248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73f9484636ebafcc25b696b1f7511adf253dd14ca2153652de2e9953823cf98a
3
+ size 4895442400
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08a0ab69d649c1293d7f1c49749feb59260a2c8e28e1ec9a4b4b18e5cb385b0e
3
- size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:109e1723323dcf5ea303e87b4450181c2b4df5446085ff49f06de7b3cdee3ad7
3
+ size 4991497784
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e470ca361e99f6a85af127ecb4f80391ed078d7aed6fc2554a649f400f2afe6
3
- size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f11714fcd99ef9f2aecb013958e70165b6dff5803801ddd6761042f69a2f4e0
3
+ size 4932752872
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3eb3387772c3fbe8e9aa349ed4cbafdd46b118d3418f93e06a8e2d785d145351
3
- size 2370335624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:394652a278904bd992eddaec542ca2540aa5ae4ebce8c580c2615b03bd9abb00
3
+ size 1689086112
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
qwen_model_v2.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Qwen2ForCausalLM, AutoModel, Qwen2Config
2
+ from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer
3
+ from transformers import WhisperPreTrainedModel, WhisperConfig
4
+ from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
5
+ from transformers.modeling_utils import PreTrainedModel
6
+ from transformers.generation import GenerationMixin
7
+ from transformers.models.auto import AutoModel, AutoModelForCausalLM
8
+ from torch import nn
9
+ import torch
10
+ import math
11
+ import logging
12
+
13
class WhisperEncoder(WhisperPreTrainedModel):
    """Whisper-style audio encoder.

    Two ``Conv1d`` layers (the second with stride 2) feed a stack of
    ``WhisperEncoderLayer`` blocks followed by a final ``LayerNorm``.
    Position information comes from a frozen ``nn.Embedding`` that is looked
    up with a pre-registered ``arange`` buffer, so the input must always have
    the full expected length.
    """

    def __init__(self, config: WhisperConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        hidden_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_source_positions
        # Stored for parity with the config; forward() below does not read it.
        if config.scale_embedding:
            self.embed_scale = math.sqrt(hidden_dim)
        else:
            self.embed_scale = 1.0

        # Convolutional front-end: mel bins -> d_model, then 2x downsampling.
        self.conv1 = nn.Conv1d(self.num_mel_bins, hidden_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, stride=2, padding=1)

        # Index buffer used to fetch every positional embedding in forward().
        self.register_buffer('range_max_source_positions', torch.arange(self.max_source_positions))

        self.embed_positions = nn.Embedding(self.max_source_positions, hidden_dim)
        self.embed_positions.requires_grad_(False)

        self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        self.post_init()

    def _freeze_parameters(self):
        """Disable gradients for every parameter of the encoder."""
        for weight in self.parameters():
            weight.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        """Return the first convolution, which acts as the input embedding."""
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        """Replace the first convolution."""
        self.conv1 = value

    def forward(
        self,
        input_features,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode mel features into hidden states.

        Args:
            input_features: Mel features whose last dimension must equal
                ``max_source_positions * conv1.stride[0] * conv2.stride[0]``.
            attention_mask: Accepted for interface compatibility only; every
                encoder layer is called with ``None`` as its mask.
            head_mask: Optional per-layer head mask; its first dimension must
                equal the number of encoder layers.
            output_attentions: Collect per-layer attention weights.
            output_hidden_states: Collect the hidden state before every layer
                and after the final layer norm.
            return_dict: Return a ``BaseModelOutput`` instead of a tuple.

        Raises:
            ValueError: If the last dimension of ``input_features`` does not
                match the expected length.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        # Unset flags fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        conv_out = nn.functional.gelu(self.conv1(input_features))
        conv_out = nn.functional.gelu(self.conv2(conv_out))

        # (batch, channels, time) -> (batch, time, channels)
        frames = conv_out.permute(0, 2, 1)
        positions = self.embed_positions(self.range_max_source_positions)

        hidden_states = nn.functional.dropout(frames + positions, p=self.dropout, training=self.training)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            assert head_mask.size()[0] == (len(self.layers)), (
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
            )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            # LayerDrop (https://arxiv.org/abs/1909.11556): randomly skip the
            # layer during training; the random draw only happens in training.
            skip_layer = self.training and torch.rand([]) < self.layerdrop

            if skip_layer:
                layer_outputs = (None, None)
            else:
                layer_head_mask = head_mask[idx] if head_mask is not None else None
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        None,
                        layer_head_mask,
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        None,
                        layer_head_mask=layer_head_mask,
                        output_attentions=output_attentions,
                    )

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
            )
        return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
133
+
134
class LLMAudioPreTrainedModel(PreTrainedModel):
    """Base class wiring the audio-LLM into the ``PreTrainedModel`` machinery.

    Declares the config class and capability flags, and provides the weight
    initialiser used for ``Linear``, ``Conv1d`` and ``Embedding`` modules.
    """

    config_class = Qwen2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _resolve_init_std(self):
        """Return the standard deviation used for normal weight init.

        Prefers ``config.initializer_range``. If it is missing, falls back to
        the audio sub-config, which this file stores as
        ``config.audio_encoder_config`` (a dict passed to
        ``WhisperConfig.from_dict``). Whisper names the value ``init_std``, so
        both keys are tried.
        """
        std = getattr(self.config, "initializer_range", None)
        if std is not None:
            return std

        audio_cfg = getattr(self.config, "audio_encoder_config", None)
        for key in ("initializer_range", "init_std"):
            if isinstance(audio_cfg, dict):
                value = audio_cfg.get(key)
            else:
                value = getattr(audio_cfg, key, None)
            if value is not None:
                return value

        # NOTE(review): 0.02 is the library default for both
        # Qwen2Config.initializer_range and WhisperConfig.init_std - confirm
        # this is the desired last-resort value.
        return 0.02

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        ``Linear``/``Conv1d`` weights and ``Embedding`` weights are drawn from
        ``N(0, std)``; biases and the embedding row at ``padding_idx`` (when
        set) are zeroed. Other module types are left untouched.
        """
        std = self._resolve_init_std()

        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
157
+
158
class LLMAudioForConditionalGeneration(LLMAudioPreTrainedModel, GenerationMixin):
    """Causal language model that consumes Whisper-encoded audio.

    Audio features are encoded by ``WhisperEncoder``, projected to the
    language model's hidden size, and written over the token embeddings at
    every position whose id equals ``config.audio_token_index``. The combined
    embeddings are then run through the wrapped causal LM.
    """

    def __init__(self, config):
        super().__init__(config)

        # The audio sub-config is stored on the main config as a plain dict.
        audio_config = WhisperConfig.from_dict(config.audio_encoder_config)
        self.encoder = WhisperEncoder(audio_config)
        self.projection = nn.Linear(self.encoder.config.d_model, self.config.hidden_size, bias=False)
        self.language_model = AutoModelForCausalLM.from_config(config)
        if self.language_model._tied_weights_keys is not None:
            self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]

        # Standard PreTrainedModel finalisation (weight init for modules not
        # yet initialised, weight tying); the original constructor omitted it.
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _encode_audio(self, input_features, feature_attention_mask):
        """Encode audio and return the valid projected frames, flattened.

        Returns a 2-D tensor ``(total_valid_frames, hidden_size)`` holding,
        for each audio in order, only the frames within that audio's
        downsampled length.

        Raises:
            ValueError: If ``feature_attention_mask`` is ``None``.
        """
        if feature_attention_mask is None:
            raise ValueError(
                "`feature_attention_mask` is required when `input_features` is provided."
            )

        target_device = self.encoder.device
        input_features = input_features.to(target_device)
        feature_attention_mask = feature_attention_mask.to(target_device)

        batch_size, _, max_mel_seq_len = input_features.shape
        max_seq_len = (max_mel_seq_len - 2) // 2 + 1
        audio_feat_lengths = self.encoder._get_feat_extract_output_lengths(feature_attention_mask.sum(-1))

        seq_range = (
            torch.arange(0, max_seq_len, dtype=audio_feat_lengths.dtype, device=audio_feat_lengths.device)
            .unsqueeze(0)
            .expand(batch_size, max_seq_len)
        )
        lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
        padding_mask = seq_range >= lengths_expand

        # Additive mask: 0 for valid frames, -inf for padding.
        # NOTE(review): WhisperEncoder.forward in this file accepts
        # `attention_mask` but calls every layer with None, so this mask
        # currently has no effect; it is still passed to keep the call
        # unchanged.
        audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
            batch_size, 1, max_seq_len, max_seq_len
        )
        audio_attention_mask = audio_attention_mask_.to(
            dtype=self.encoder.conv1.weight.dtype, device=self.encoder.conv1.weight.device
        )
        audio_attention_mask[audio_attention_mask_] = float("-inf")

        audio_outputs = self.encoder(input_features, attention_mask=audio_attention_mask)
        audio_features = self.projection(audio_outputs.last_hidden_state)

        num_audios, max_audio_tokens, embed_dim = audio_features.shape
        # Build the frame index on the target device instead of CPU-then-move.
        frame_index = torch.arange(max_audio_tokens, device=audio_feat_lengths.device).expand(
            num_audios, max_audio_tokens
        )
        audio_features_mask = frame_index < audio_feat_lengths.unsqueeze(1)
        return audio_features[audio_features_mask].view(-1, embed_dim)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        position_ids=None,
        input_features=None,
        feature_attention_mask=None,
        past_key_values=None,
        print_input_features_shape=False,
        inputs_embeds=None,
        **kwargs,
    ):
        """Run the language model on text embeddings merged with audio.

        Args:
            input_ids: Token ids; positions equal to
                ``config.audio_token_index`` are overwritten with audio
                embeddings.
            attention_mask: Attention mask forwarded to the language model.
            position_ids: Position ids forwarded to the language model.
            input_features: Audio features for the encoder. They are only
                encoded when ``inputs_embeds`` is not given and ``input_ids``
                holds more than one token per row.
            feature_attention_mask: Mask over frames of ``input_features``;
                required whenever the audio is encoded.
            past_key_values: Cache forwarded to the language model.
            print_input_features_shape: Debug switch; prints the shape and
                values of the merged audio embeddings.
            inputs_embeds: Pre-computed embeddings. When given, ``input_ids``
                and the audio inputs are not used.
            **kwargs: ``use_cache``, ``output_attentions`` and
                ``output_hidden_states`` are forwarded to the language model
                when not ``None``; everything else (including ``labels``) is
                ignored.

        Returns:
            ``CausalLMOutputWithPast`` with ``loss`` always ``None``.

        Raises:
            ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is
                given, if audio must be encoded without a
                ``feature_attention_mask``, or if the number of audio
                placeholder tokens differs from the number of audio frames.
        """
        if inputs_embeds is None:
            if input_ids is None:
                raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")

            inputs_embeds = self.get_input_embeddings()(input_ids)

            # Single-token steps skip the audio merge entirely.
            if input_features is not None and input_ids.shape[1] != 1:
                masked_audio_features = self._encode_audio(input_features, feature_attention_mask)
                if print_input_features_shape:
                    print(masked_audio_features.shape, masked_audio_features.contiguous())

                placeholder_mask = (input_ids == self.config.audio_token_index).to(inputs_embeds.device)
                num_placeholders = int(placeholder_mask.sum())
                if num_placeholders != masked_audio_features.shape[0]:
                    raise ValueError(
                        f"Audio features and audio tokens do not match: tokens: {num_placeholders}, "
                        f"features: {masked_audio_features.shape[0]}"
                    )
                # Align dtype/device so the in-place write cannot fail when the
                # encoder and the embedding table differ in either.
                inputs_embeds[placeholder_mask] = masked_audio_features.contiguous().to(
                    device=inputs_embeds.device, dtype=inputs_embeds.dtype
                )

        # Forward only the well-known generation/output switches; anything
        # left at None falls back to the language model's config defaults.
        passthrough = {
            key: kwargs[key]
            for key in ("use_cache", "output_attentions", "output_hidden_states")
            if kwargs.get(key) is not None
        }

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            return_dict=True,  # attribute access below needs a ModelOutput
            **passthrough,
        )

        return CausalLMOutputWithPast(
            loss=None,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )