Upload processor
Browse files
- chat_template.json +3 -0
- llm_audio_processing.py +252 -0
- preprocessor_config.json +4 -1
- processor_config.json +9 -0
- tokenizer.json +2 -2
- tokenizer_config.json +4 -0
chat_template.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
|
3 |
+
}
|
llm_audio_processing.py
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 The HuggingFace Inc. team.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""
|
16 |
+
Processor class for Qwen2Audio.
|
17 |
+
"""
|
18 |
+
|
19 |
+
import warnings
|
20 |
+
from typing import Union
|
21 |
+
|
22 |
+
import numpy as np
|
23 |
+
|
24 |
+
from transformers.feature_extraction_utils import BatchFeature
|
25 |
+
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
26 |
+
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
27 |
+
from transformers.utils.deprecation import deprecate_kwarg
|
28 |
+
|
29 |
+
|
30 |
+
class LLMAudioProcessorKwargs(ProcessingKwargs, total=False):
    # Default keyword arguments, grouped per modality. `LLMAudioProcessor.__call__`
    # hands this class to `_merge_kwargs` to build the kwargs for each component.
    _defaults = {
        # Text goes to the tokenizer unpadded unless the caller asks otherwise.
        "text_kwargs": {"padding": False},
        # No audio defaults are declared here.
        "audio_kwargs": {},
    }
|
37 |
+
|
38 |
+
|
39 |
+
class LLMAudioProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen2Audio-style processor which wraps a Whisper feature extractor and a tokenizer into a single
    processor.

    [`LLMAudioProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and of the wrapped tokenizer.
    See [`~LLMAudioProcessor.__call__`] and [`~LLMAudioProcessor.decode`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
            The feature extractor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input. A `ValueError` is raised when it is `None`.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the default chat template
            is used.
        audio_token (`str`, *optional*, defaults to `"<|AUDIO|>"`):
            The token to use for audio tokens. Ignored when the tokenizer exposes an `audio_token` attribute.
        audio_bos_token (`str`, *optional*, defaults to `"<|audio_bos|>"`):
            The token to use for audio bos tokens. Ignored when the tokenizer exposes an `audio_bos_token` attribute.
        audio_eos_token (`str`, *optional*, defaults to `"<|audio_eos|>"`):
            The token to use for audio eos tokens. Ignored when the tokenizer exposes an `audio_eos_token` attribute.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        feature_extractor=None,
        tokenizer=None,
        chat_template=None,
        audio_token="<|AUDIO|>",
        audio_bos_token="<|audio_bos|>",
        audio_eos_token="<|audio_eos|>",
    ):
        # The tokenizer is dereferenced below; fail with a clear message instead of an
        # AttributeError on `None`.
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if chat_template is None:
            chat_template = self.default_chat_template
        # Special tokens declared on the tokenizer take precedence over the constructor arguments.
        self.audio_token = getattr(tokenizer, "audio_token", audio_token)
        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
        self.audio_bos_token = getattr(tokenizer, "audio_bos_token", audio_bos_token)
        self.audio_eos_token = getattr(tokenizer, "audio_eos_token", audio_eos_token)
        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)

    @deprecate_kwarg("audios", version="4.54.0", new_name="audio")
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio: Union[np.ndarray, list[np.ndarray]] = None,
        audios=None,  # kept for BC
        **kwargs: Unpack[LLMAudioProcessorKwargs],
    ) -> BatchFeature:
        """
        Tokenize `text` and, when `audio` is given, extract audio features and expand every audio token in the text
        to as many audio tokens as the corresponding audio requires.

        Args:
            text (`str` or `list[str]`):
                The prompt(s) to encode. A single string is wrapped into a one-element list. When `audio` is given,
                the total number of audio tokens across all prompts must equal the number of audios.
            audio (`np.ndarray` or `list[np.ndarray]`, *optional*):
                The audio(s) to pass to the feature extractor. A single array counts as one audio.
            audios (*optional*):
                Deprecated alias of `audio`, only used when `audio` is `None`.
            **kwargs:
                Extra arguments merged with the defaults of `LLMAudioProcessorKwargs` and split between the
                tokenizer (`text_kwargs`) and the feature extractor (`audio_kwargs`).

        Returns:
            [`BatchFeature`]: the tokenizer outputs and, when `audio` is given, the feature extractor outputs with
            their `attention_mask` renamed to `feature_attention_mask`.

        Raises:
            ValueError: if `text` is missing or is not a string / list of strings, or if the number of audio tokens
                in `text` does not match the number of audios.
        """
        # Handle BC when user passes deprecated keyword argument
        # NOTE(review): the decorator announces 4.54.0 while the message below says v4.55 - confirm which is intended.
        if audios is not None and audio is None:
            audio = audios
            warnings.warn(
                "You may have used the keyword argument for the `audio` inputs. It is strongly recommended to pass inputs with keyword arguments "
                "with keys `audio` and `text`. From transformers v4.55 `audio` will be the only acceptable keyword argument.",
                FutureWarning,
            )

        text = self._normalize_text(text)

        output_kwargs = self._merge_kwargs(
            LLMAudioProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        audio_inputs = None
        if audio is not None:
            # Audio token expansion works on raw strings only.
            if not all(isinstance(sample, str) for sample in text):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")

            # ensure we have as much audios as audio tokens
            num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
            num_audios = 1 if isinstance(audio, np.ndarray) else len(audio)
            if num_audio_tokens != num_audios:
                raise ValueError(
                    f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}"
                )

            # Some kwargs should not be changed so we can expand text with audio tokens below
            output_kwargs["audio_kwargs"]["return_attention_mask"] = True
            output_kwargs["audio_kwargs"]["padding"] = "max_length"
            audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])

            # rename attention_mask to prevent conflicts later on
            audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")

            # One length per audio: the number of attended feature frames.
            audio_lengths = audio_inputs["feature_attention_mask"].sum(-1).tolist()
            text = self._expand_audio_tokens(text, audio_lengths)

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if audio_inputs is not None:
            inputs.update(audio_inputs)

        return BatchFeature(data={**inputs}, tensor_type=return_tensors)

    @staticmethod
    def _normalize_text(text):
        """Validate `text` and wrap a single string into a list; raise `ValueError` for unsupported input."""
        if text is None:
            raise ValueError("You need to specify `text` input to process.")
        if isinstance(text, str):
            return [text]
        if not isinstance(text, list):
            # Non-list inputs are only accepted when they are indexable and start with a string.
            # Anything else raises ValueError rather than crashing on `text[0]`.
            try:
                first = text[0]
            except (TypeError, IndexError, KeyError):
                first = None
            if not isinstance(first, str):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
        return text

    def _expand_audio_tokens(self, text, audio_lengths):
        """Return a copy of `text` where each audio token is repeated to match its audio's feature length."""
        token = self.audio_token
        bos = self.audio_bos_token
        eos = self.audio_eos_token
        remaining_lengths = iter(audio_lengths)

        expanded_text = []
        for sample in text:
            # Scan the original sample left to right and rebuild it piece by piece, so the text inserted for one
            # audio is never rescanned and no intermediate placeholder string can clash with user text.
            pieces = []
            cursor = 0
            start = sample.find(token)
            while start != -1:
                end = start + len(token)

                audio_length = next(remaining_lengths, None)
                if audio_length is None:
                    raise ValueError("Found more audio tokens in the provided text than extracted audio features")
                # Frames -> audio tokens; presumably mirrors the audio encoder's downsampling - TODO confirm.
                num_audio_tokens = (audio_length - 1) // 2 + 1
                expanded_audio_token = token * num_audio_tokens

                # Check if this audio token is surrounded by bos/eos tokens
                has_bos = sample.endswith(bos, cursor, start)
                has_eos = sample.startswith(eos, end)
                # The markers are only added when neither of them is already present.
                if not has_bos and not has_eos:
                    expanded_audio_token = bos + expanded_audio_token + eos

                pieces.append(sample[cursor:start])
                pieces.append(expanded_audio_token)
                cursor = end
                start = sample.find(token, cursor)

            pieces.append(sample[cursor:])
            expanded_text.append("".join(pieces))
        return expanded_text

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Input names of the tokenizer and the feature extractor plus `feature_attention_mask`, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names + ["feature_attention_mask"]))

    @property
    # NOTE: we don't have default templates anymore, and the below is kept only because the hub config is not yet updated!
    def default_chat_template(self):
        """
        This default template formats inputs in the form of a chat history, wrapping every message in
        `<|im_start|>` / `<|im_end|>` markers. For each message in the chat history:
        * the template will output the role of the speaker followed by the content of the message.
        * content is either a string or a list of text and audio elements.
        * If the content element is an audio, the template will output a numbered
          `<|audio_bos|><|AUDIO|><|audio_eos|>` entry.
        * If the first message is not a system message, a default system message is emitted first.

        Example:

        ```python
        messages = [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {"role": "user", "content": [
                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
                {"type": "text", "text": "What's that sound?"},
            ]},
            {"role": "assistant", "content": "It is the sound of glass shattering."},
            {"role": "user", "content": [
                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
                {"type": "text", "text": "How about this one?"},
            ]},
        ]

        result = template.render(messages=messages, add_generation_prompt=True)
        ```
        """
        # fmt: off
        return (
            "{% set audio_count = namespace(value=0) %}"
            "{% for message in messages %}"
                "{% if loop.first and message['role'] != 'system' %}"
                    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
                "{% endif %}"
                "<|im_start|>{{ message['role'] }}\n"
                "{% if message['content'] is string %}"
                    "{{ message['content'] }}<|im_end|>\n"
                "{% else %}"
                    "{% for content in message['content'] %}"
                        "{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %}"
                            "{% set audio_count.value = audio_count.value + 1 %}"
                            "Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
                        "{% elif 'text' in content %}"
                            "{{ content['text'] }}"
                        "{% endif %}"
                    "{% endfor %}"
                    "<|im_end|>\n"
                "{% endif %}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
                "<|im_start|>assistant\n"
            "{% endif %}"
        )
        # fmt: on
|
250 |
+
|
251 |
+
|
252 |
+
__all__ = ["LLMAudioProcessor"]
|
preprocessor_config.json
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
{
|
|
|
|
|
|
|
2 |
"chunk_length": 30,
|
3 |
"dither": 0.0,
|
4 |
"feature_extractor_type": "WhisperFeatureExtractor",
|
@@ -9,7 +12,7 @@
|
|
9 |
"nb_max_frames": 3000,
|
10 |
"padding_side": "right",
|
11 |
"padding_value": 0.0,
|
12 |
-
"processor_class": "
|
13 |
"return_attention_mask": false,
|
14 |
"sampling_rate": 16000
|
15 |
}
|
|
|
1 |
{
|
2 |
+
"auto_map": {
|
3 |
+
"AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
|
4 |
+
},
|
5 |
"chunk_length": 30,
|
6 |
"dither": 0.0,
|
7 |
"feature_extractor_type": "WhisperFeatureExtractor",
|
|
|
12 |
"nb_max_frames": 3000,
|
13 |
"padding_side": "right",
|
14 |
"padding_value": 0.0,
|
15 |
+
"processor_class": "LLMAudioProcessor",
|
16 |
"return_attention_mask": false,
|
17 |
"sampling_rate": 16000
|
18 |
}
|
processor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_bos_token": "<|audio_bos|>",
|
3 |
+
"audio_eos_token": "<|audio_eos|>",
|
4 |
+
"audio_token": "<|AUDIO|>",
|
5 |
+
"auto_map": {
|
6 |
+
"AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
|
7 |
+
},
|
8 |
+
"processor_class": "LLMAudioProcessor"
|
9 |
+
}
|
tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99474fa95a1d2494b675ea499a6a15c177217449a4fe34f52ee130bfee7b723d
|
3 |
+
size 11422630
|
tokenizer_config.json
CHANGED
@@ -218,6 +218,9 @@
|
|
218 |
"<|image_pad|>",
|
219 |
"<|video_pad|>"
|
220 |
],
|
|
|
|
|
|
|
221 |
"bos_token": null,
|
222 |
"chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
|
223 |
"clean_up_tokenization_spaces": false,
|
@@ -226,6 +229,7 @@
|
|
226 |
"extra_special_tokens": {},
|
227 |
"model_max_length": 131072,
|
228 |
"pad_token": "<|endoftext|>",
|
|
|
229 |
"split_special_tokens": false,
|
230 |
"tokenizer_class": "Qwen2Tokenizer",
|
231 |
"unk_token": null
|
|
|
218 |
"<|image_pad|>",
|
219 |
"<|video_pad|>"
|
220 |
],
|
221 |
+
"auto_map": {
|
222 |
+
"AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
|
223 |
+
},
|
224 |
"bos_token": null,
|
225 |
"chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
|
226 |
"clean_up_tokenization_spaces": false,
|
|
|
229 |
"extra_special_tokens": {},
|
230 |
"model_max_length": 131072,
|
231 |
"pad_token": "<|endoftext|>",
|
232 |
+
"processor_class": "LLMAudioProcessor",
|
233 |
"split_special_tokens": false,
|
234 |
"tokenizer_class": "Qwen2Tokenizer",
|
235 |
"unk_token": null
|