huseinzol05 committed on
Commit
933c9a3
·
verified ·
1 Parent(s): 307d3d6

Upload processor

Browse files
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
llm_audio_processing.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Qwen2Audio.
17
+ """
18
+
19
+ import warnings
20
+ from typing import Union
21
+
22
+ import numpy as np
23
+
24
+ from transformers.feature_extraction_utils import BatchFeature
25
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
26
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
27
+ from transformers.utils.deprecation import deprecate_kwarg
28
+
29
+
30
class LLMAudioProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema for `LLMAudioProcessor.__call__`.

    Declares the per-modality defaults merged by `ProcessorMixin._merge_kwargs`:
    text tokenization runs without padding unless the caller overrides it, and
    the feature extractor receives no extra defaults.
    """

    _defaults = {
        "text_kwargs": {"padding": False},
        "audio_kwargs": {},
    }
37
+
38
+
39
class LLMAudioProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen2Audio processor which wraps a Qwen2Audio feature extractor and a Qwen2Audio tokenizer into a
    single processor.

    [`Qwen2AudioProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`Qwen2TokenizerFast`].
    See the [`~Qwen2AudioProcessor.__call__`] and [`~Qwen2AudioProcessor.decode`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
            The feature extractor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the default chat template
            is used.
        audio_token (`str`, *optional*, defaults to `"<|AUDIO|>"`):
            The token to use for audio tokens.
        audio_bos_token (`str`, *optional*, defaults to `"<|audio_bos|>"`):
            The token to use for audio bos tokens.
        audio_eos_token (`str`, *optional*, defaults to `"<|audio_eos|>"`):
            The token to use for audio eos tokens.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        feature_extractor=None,
        tokenizer=None,
        chat_template=None,
        audio_token="<|AUDIO|>",
        audio_bos_token="<|audio_bos|>",
        audio_eos_token="<|audio_eos|>",
    ):
        if chat_template is None:
            chat_template = self.default_chat_template
        # Prefer the tokenizer's own special-token attributes when it defines them,
        # so the processor stays consistent with the tokenizer's vocabulary.
        self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
        self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token
        self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token
        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)

    @deprecate_kwarg("audios", version="4.54.0", new_name="audio")
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio: Union[np.ndarray, list[np.ndarray]] = None,
        audios=None,  # kept for BC
        **kwargs: Unpack[LLMAudioProcessorKwargs],
    ) -> BatchFeature:
        """
        Prepare text(s) and audio clip(s) for the model.

        Each `audio_token` placeholder in `text` is expanded to as many audio tokens as the feature extractor
        produces frames for the matching clip (after the length formula below), and the clip is converted to
        input features. A bos/eos pair is wrapped around the expansion unless already present in the text.

        Args:
            text (`str` or `list[str]`):
                Sequence(s) to tokenize; one `audio_token` per audio clip.
            audio (`np.ndarray` or `list[np.ndarray]`, *optional*):
                Audio clip(s) matching the `audio_token`s in `text`.
            audios:
                Deprecated alias of `audio`, kept for backward compatibility.

        Returns:
            [`BatchFeature`]: Tokenizer outputs, plus `input_features` and `feature_attention_mask` when
            audio is provided.

        Raises:
            ValueError: If `text` is missing or not (a) a string or (b) a list/tuple of strings, or if the
                number of clips does not match the number of audio tokens.
        """
        # Handle BC when user passes deprecated keyword argument
        if audios is not None and audio is None:
            audio = audios
            warnings.warn(
                "You may have used the keyword argument for the `audio` inputs. It is strongly recommended to pass inputs with keyword arguments "
                "with keys `audio` and `text`. From transformers v4.55 `audio` will be the only acceptable keyword argument.",
                FutureWarning,
            )

        if text is None:
            raise ValueError("You need to specify `text` input to process.")
        elif isinstance(text, str):
            text = [text]
        # FIX: the original `not isinstance(text, list) and not isinstance(text[0], str)`
        # never fired for lists, letting non-string elements through only to crash later
        # on `str` methods. Validate every element up front (tuples of str also worked
        # before, so keep accepting them).
        elif not isinstance(text, (list, tuple)) or not all(isinstance(t, str) for t in text):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")

        output_kwargs = self._merge_kwargs(
            LLMAudioProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if audio is not None:
            # Ensure we have as many audio clips as audio tokens in the text.
            num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
            # FIX: use isinstance instead of `type(...) is np.ndarray` so ndarray
            # subclasses are also treated as a single clip rather than iterated by len().
            num_audios = 1 if isinstance(audio, np.ndarray) else len(audio)
            if num_audio_tokens != num_audios:
                raise ValueError(
                    f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}"
                )

            # Some kwargs should not be changed so we can expand text with audio tokens below
            output_kwargs["audio_kwargs"]["return_attention_mask"] = True
            output_kwargs["audio_kwargs"]["padding"] = "max_length"
            audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])

            # rename attention_mask to prevent conflicts later on
            audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")

            expanded_text = []
            # Unpadded frame count per clip, taken from the feature extractor's mask.
            audio_lengths = audio_inputs["feature_attention_mask"].sum(-1).tolist()

            for sample in text:
                replace_str = []
                while self.audio_token in sample:
                    audio_length = audio_lengths.pop(0)
                    # Ceil-divide the frame count by 2 — presumably one audio token per
                    # 2 feature frames (encoder downsampling); TODO confirm against model.
                    num_expanded_tokens = (audio_length - 1) // 2 + 1
                    expanded_audio_token = self.audio_token * num_expanded_tokens

                    start_idx = sample.find(self.audio_token)
                    end_idx = start_idx + len(self.audio_token)

                    # FIX: guard against a negative slice start (audio token close to the
                    # beginning of the string would otherwise wrap around the end).
                    bos_start = start_idx - len(self.audio_bos_token)
                    has_bos = bos_start >= 0 and sample[bos_start:start_idx] == self.audio_bos_token
                    has_eos = sample[end_idx : end_idx + len(self.audio_eos_token)] == self.audio_eos_token

                    # Wrap with bos/eos only when the text does not already provide them.
                    if not has_bos and not has_eos:
                        expanded_audio_token = self.audio_bos_token + expanded_audio_token + self.audio_eos_token

                    replace_str.append(expanded_audio_token)
                    # Two-pass replacement so the expansion (which repeats the audio
                    # token) is not re-matched by the `while` scan above.
                    sample = sample.replace(self.audio_token, "<placeholder>", 1)

                while "<placeholder>" in sample:
                    sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
                expanded_text.append(sample)
            text = expanded_text

        # Tokenize as python lists first; convert everything to tensors in one place below.
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if audio is not None:
            inputs.update(audio_inputs)

        return BatchFeature(data={**inputs}, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Union of tokenizer and feature-extractor input names plus `feature_attention_mask`, de-duplicated."""
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names + ["feature_attention_mask"]))

    @property
    def default_chat_template(self):
        # NOTE: we don't have default templates anymore, and the below is kept only because the hub config is not yet updated!
        """
        This default vicuna template formats inputs in the form of a chat history. For each message in the chat history:
        * the template will output the role of the speaker followed by the content of the message.
        * content is a list of strings and audios.
        * If the content element is an audio, the template will output a sequence of <|AUDIO|> tokens

        Example:

        ```python
        messages = [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {"role": "user", "content": [
                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
                {"type": "text", "text": "What's that sound?"},
            ]},
            {"role": "assistant", "content": "It is the sound of glass shattering."},
            {"role": "user", "content": [
                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
                {"type": "text", "text": "How about this one?"},
            ]},
        ]

        result = template.render(messages=messages, add_generation_prompt=True)
        ```
        """
        # fmt: off
        return (
            "{% set audio_count = namespace(value=0) %}"
            "{% for message in messages %}"
            "{% if loop.first and message['role'] != 'system' %}"
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "{% endif %}"
            "<|im_start|>{{ message['role'] }}\n"
            "{% if message['content'] is string %}"
            "{{ message['content'] }}<|im_end|>\n"
            "{% else %}"
            "{% for content in message['content'] %}"
            "{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %}"
            "{% set audio_count.value = audio_count.value + 1 %}"
            "Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
            "{% elif 'text' in content %}"
            "{{ content['text'] }}"
            "{% endif %}"
            "{% endfor %}"
            "<|im_end|>\n"
            "{% endif %}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            "<|im_start|>assistant\n"
            "{% endif %}"
        )
        # fmt: on
250

# Public API of this module; also what `from llm_audio_processing import *` exports
# for AutoProcessor discovery via the hub `auto_map` entry.
__all__ = ["LLMAudioProcessor"]
preprocessor_config.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "chunk_length": 30,
3
  "dither": 0.0,
4
  "feature_extractor_type": "WhisperFeatureExtractor",
@@ -9,7 +12,7 @@
9
  "nb_max_frames": 3000,
10
  "padding_side": "right",
11
  "padding_value": 0.0,
12
- "processor_class": "WhisperProcessor",
13
  "return_attention_mask": false,
14
  "sampling_rate": 16000
15
  }
 
1
  {
2
+ "auto_map": {
3
+ "AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
4
+ },
5
  "chunk_length": 30,
6
  "dither": 0.0,
7
  "feature_extractor_type": "WhisperFeatureExtractor",
 
12
  "nb_max_frames": 3000,
13
  "padding_side": "right",
14
  "padding_value": 0.0,
15
+ "processor_class": "LLMAudioProcessor",
16
  "return_attention_mask": false,
17
  "sampling_rate": 16000
18
  }
processor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_bos_token": "<|audio_bos|>",
3
+ "audio_eos_token": "<|audio_eos|>",
4
+ "audio_token": "<|AUDIO|>",
5
+ "auto_map": {
6
+ "AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
7
+ },
8
+ "processor_class": "LLMAudioProcessor"
9
+ }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7bb5367c34e9391d39dc1028ea4347622217299c54632a9840a91c1200fa3172
3
- size 11422462
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99474fa95a1d2494b675ea499a6a15c177217449a4fe34f52ee130bfee7b723d
3
+ size 11422630
tokenizer_config.json CHANGED
@@ -218,6 +218,9 @@
218
  "<|image_pad|>",
219
  "<|video_pad|>"
220
  ],
 
 
 
221
  "bos_token": null,
222
  "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
223
  "clean_up_tokenization_spaces": false,
@@ -226,6 +229,7 @@
226
  "extra_special_tokens": {},
227
  "model_max_length": 131072,
228
  "pad_token": "<|endoftext|>",
 
229
  "split_special_tokens": false,
230
  "tokenizer_class": "Qwen2Tokenizer",
231
  "unk_token": null
 
218
  "<|image_pad|>",
219
  "<|video_pad|>"
220
  ],
221
+ "auto_map": {
222
+ "AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
223
+ },
224
  "bos_token": null,
225
  "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
226
  "clean_up_tokenization_spaces": false,
 
229
  "extra_special_tokens": {},
230
  "model_max_length": 131072,
231
  "pad_token": "<|endoftext|>",
232
+ "processor_class": "LLMAudioProcessor",
233
  "split_special_tokens": false,
234
  "tokenizer_class": "Qwen2Tokenizer",
235
  "unk_token": null