Malaysian-Qwen2.5-7B-Speech-Instruct / llm_audio_processing.py
huseinzol05's picture
Upload processor
933c9a3 verified
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for Qwen2Audio.
"""
import warnings
from typing import Union
import numpy as np
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils.deprecation import deprecate_kwarg
class LLMAudioProcessorKwargs(ProcessingKwargs, total=False):
_defaults = {
"text_kwargs": {
"padding": False,
},
"audio_kwargs": {},
}
class LLMAudioProcessor(ProcessorMixin):
r"""
Constructs a Qwen2Audio processor which wraps a Qwen2Audio feature extractor and a Qwen2Audio tokenizer into a single processor.
[`Qwen2AudioProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`Qwen2TokenizerFast`]. See the
[`~Qwen2AudioProcessor.__call__`] and [`~Qwen2AudioProcessor.decode`] for more information.
Args:
feature_extractor ([`WhisperFeatureExtractor`], *optional*):
The feature extractor is a required input.
tokenizer ([`Qwen2TokenizerFast`], *optional*):
The tokenizer is a required input.
chat_template (`Optional[str]`, *optional*):
The Jinja template to use for formatting the conversation. If not provided, the default chat template
is used.
audio_token (`str`, *optional*, defaults to `"<|AUDIO|>"`):
The token to use for audio tokens.
audio_bos_token (`str`, *optional*, defaults to `"<|audio_bos|>"`):
The token to use for audio bos tokens.
audio_eos_token (`str`, *optional*, defaults to `"<|audio_eos|>"`):
The token to use for audio eos tokens.
"""
attributes = ["feature_extractor", "tokenizer"]
feature_extractor_class = "WhisperFeatureExtractor"
tokenizer_class = "AutoTokenizer"
def __init__(
self,
feature_extractor=None,
tokenizer=None,
chat_template=None,
audio_token="<|AUDIO|>",
audio_bos_token="<|audio_bos|>",
audio_eos_token="<|audio_eos|>",
):
if chat_template is None:
chat_template = self.default_chat_template
self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token
self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token
super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
@deprecate_kwarg("audios", version="4.54.0", new_name="audio")
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
audio: Union[np.ndarray, list[np.ndarray]] = None,
audios=None, # kept for BC
**kwargs: Unpack[LLMAudioProcessorKwargs],
) -> BatchFeature:
# Handle BC when user passes deprecated keyword argument
if audios is not None and audio is None:
audio = audios
warnings.warn(
"You may have used the keyword argument for the `audio` inputs. It is strongly recommended to pass inputs with keyword arguments "
"with keys `audio` and `text`. From transformers v4.55 `audio` will be the only acceptable keyword argument.",
FutureWarning,
)
if text is None:
raise ValueError("You need to specify `text` input to process.")
elif isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
output_kwargs = self._merge_kwargs(
LLMAudioProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if audio is not None:
# ensure we have as much audios as audio tokens
num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
num_audios = 1 if type(audio) is np.ndarray else len(audio)
if num_audio_tokens != num_audios:
raise ValueError(
f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}"
)
# Some kwargs should not be changed so we can expand text with audio tokens below
output_kwargs["audio_kwargs"]["return_attention_mask"] = True
output_kwargs["audio_kwargs"]["padding"] = "max_length"
audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])
# rename attention_mask to prevent conflicts later on
audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")
expanded_text = []
audio_lengths = audio_inputs["feature_attention_mask"].sum(-1).tolist()
for sample in text:
replace_str = []
while self.audio_token in sample:
audio_length = audio_lengths.pop(0)
input_length = (audio_length - 1) // 2 + 1
num_audio_tokens = input_length
expanded_audio_token = self.audio_token * num_audio_tokens
audio_token_start_idx = sample.find(self.audio_token)
audio_token_end_idx = audio_token_start_idx + len(self.audio_token)
has_bos = (
sample[audio_token_start_idx - len(self.audio_bos_token) : audio_token_start_idx]
== self.audio_bos_token
)
has_eos = (
sample[audio_token_end_idx : audio_token_end_idx + len(self.audio_eos_token)]
== self.audio_eos_token
)
# Check if this audio token is surrounded by bos/eos tokens
if not has_bos and not has_eos:
expanded_audio_token = self.audio_bos_token + expanded_audio_token + self.audio_eos_token
replace_str.append(expanded_audio_token)
sample = sample.replace(self.audio_token, "<placeholder>", 1)
while "<placeholder>" in sample:
sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
expanded_text.append(sample)
text = expanded_text
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
if audio is not None:
inputs.update(audio_inputs)
return BatchFeature(data={**inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
feature_extractor_input_names = self.feature_extractor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names + ["feature_attention_mask"]))
@property
# NOTE: we don't have default templates anymore, and the below is kept only because the hub config is not yet updated!
def default_chat_template(self):
"""
This default vicuna template formats inputs in the form of a chat history. For each message in the chat history:
* the template will output the role of the speaker followed by the content of the message.
* content is a list of strings and audios.
* If the content element is an audio, the template will output a sequence of <|AUDIO|> tokens
Example:
```python
messages = [
{'role': 'system', 'content': 'You are a helpful assistant.'},
{"role": "user", "content": [
{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
{"type": "text", "text": "What's that sound?"},
]},
{"role": "assistant", "content": "It is the sound of glass shattering."},
{"role": "user", "content": [
{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
{"type": "text", "text": "How about this one?"},
]},
]
result = template.render(messages=messages, add_generation_prompt=True)
```
"""
# fmt: off
return (
"{% set audio_count = namespace(value=0) %}"
"{% for message in messages %}"
"{% if loop.first and message['role'] != 'system' %}"
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"{% endif %}"
"<|im_start|>{{ message['role'] }}\n"
"{% if message['content'] is string %}"
"{{ message['content'] }}<|im_end|>\n"
"{% else %}"
"{% for content in message['content'] %}"
"{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %}"
"{% set audio_count.value = audio_count.value + 1 %}"
"Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
"{% elif 'text' in content %}"
"{{ content['text'] }}"
"{% endif %}"
"{% endfor %}"
"<|im_end|>\n"
"{% endif %}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"<|im_start|>assistant\n"
"{% endif %}"
)
# fmt: on
__all__ = ["LLMAudioProcessor"]