Malaysian-Qwen2.5-7B-Speech-Instruct / llm_audio_processing.py

Upload processor

933c9a3 verified about 2 months ago

11.4 kB

	# coding=utf-8
	# Copyright 2024 The HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""
	Processor class for Qwen2Audio.
	"""

	import warnings
	from typing import Union

	import numpy as np

	from transformers.feature_extraction_utils import BatchFeature
	from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
	from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
	from transformers.utils.deprecation import deprecate_kwarg


	class LLMAudioProcessorKwargs(ProcessingKwargs, total=False):
	    """Keyword-argument schema for `LLMAudioProcessor`, holding its per-modality default values."""

	    # Text is not padded unless the caller asks for it; the audio side has no default overrides.
	    _defaults = {"text_kwargs": {"padding": False}, "audio_kwargs": {}}


	class LLMAudioProcessor(ProcessorMixin):
	    r"""
	    Constructs an LLMAudio processor which wraps a Whisper feature extractor and a tokenizer into a single processor.

	    [`LLMAudioProcessor`] combines a [`WhisperFeatureExtractor`] with a tokenizer resolved through `AutoTokenizer`.
	    See [`~LLMAudioProcessor.__call__`] and [`~LLMAudioProcessor.decode`] for more information.

	    Args:
	        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
	            The feature extractor is a required input.
	        tokenizer (*optional*):
	            The tokenizer is a required input.
	        chat_template (`Optional[str]`, *optional*):
	            The Jinja template to use for formatting the conversation. If not provided, the default chat template
	            is used.
	        audio_token (`str`, *optional*, defaults to `"<|AUDIO|>"`):
	            The token to use for audio tokens. Ignored when the tokenizer exposes an `audio_token` attribute.
	        audio_bos_token (`str`, *optional*, defaults to `"<|audio_bos|>"`):
	            The token to use for audio bos tokens. Ignored when the tokenizer exposes an `audio_bos_token` attribute.
	        audio_eos_token (`str`, *optional*, defaults to `"<|audio_eos|>"`):
	            The token to use for audio eos tokens. Ignored when the tokenizer exposes an `audio_eos_token` attribute.
	    """

	    attributes = ["feature_extractor", "tokenizer"]
	    feature_extractor_class = "WhisperFeatureExtractor"
	    tokenizer_class = "AutoTokenizer"

	    def __init__(
	        self,
	        feature_extractor=None,
	        tokenizer=None,
	        chat_template=None,
	        audio_token="<|AUDIO|>",
	        audio_bos_token="<|audio_bos|>",
	        audio_eos_token="<|audio_eos|>",
	    ):
	        if chat_template is None:
	            chat_template = self.default_chat_template
	        # Token strings defined on the tokenizer take precedence over the constructor arguments.
	        self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
	        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
	        self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token
	        self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token
	        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)

	    @deprecate_kwarg("audios", version="4.54.0", new_name="audio")
	    def __call__(
	        self,
	        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
	        audio: Union[np.ndarray, list[np.ndarray]] = None,
	        audios=None,  # kept for BC
	        **kwargs: Unpack[LLMAudioProcessorKwargs],
	    ) -> BatchFeature:
	        """
	        Tokenize `text` and, when `audio` is given, extract audio features and expand the audio tokens in the text.

	        Args:
	            text (`str` or `list[str]`):
	                The prompt(s) to encode. A single string is wrapped into a one-element list.
	            audio (`np.ndarray` or `list[np.ndarray]`, *optional*):
	                The audio input(s). The number of audios must equal the number of audio tokens found in `text`;
	                a single `np.ndarray` counts as one audio.
	            audios (*optional*):
	                Deprecated alias of `audio`, only used when `audio` is `None`.
	            **kwargs:
	                Extra arguments merged with the defaults of `LLMAudioProcessorKwargs` and the tokenizer init
	                kwargs, then dispatched to the tokenizer and the feature extractor.

	        Returns:
	            [`BatchFeature`]: the tokenizer outputs and, when `audio` is given, the feature extractor outputs with
	            their attention mask stored under `feature_attention_mask`.

	        Raises:
	            ValueError: if `text` is missing or is neither a string nor a sequence starting with a string, or if
	                the number of audio tokens in `text` differs from the number of audios.
	        """
	        # Handle BC when user passes deprecated keyword argument
	        if audios is not None and audio is None:
	            audio = audios
	            warnings.warn(
	                "You may have used the keyword argument for the `audio` inputs. It is strongly recommended to pass inputs with keyword arguments "
	                "with keys `audio` and `text`. From transformers v4.55 `audio` will be the only acceptable keyword argument.",
	                FutureWarning,
	            )

	        if text is None:
	            raise ValueError("You need to specify `text` input to process.")
	        if isinstance(text, str):
	            text = [text]
	        elif not isinstance(text, list):
	            # A non-list input is accepted only when its first element is a string. Inputs that cannot be
	            # indexed (or are empty) are reported with the same ValueError instead of a raw TypeError/IndexError.
	            try:
	                first_is_str = isinstance(text[0], str)
	            except (TypeError, IndexError, KeyError):
	                first_is_str = False
	            if not first_is_str:
	                raise ValueError("Invalid input text. Please provide a string, or a list of strings")

	        output_kwargs = self._merge_kwargs(
	            LLMAudioProcessorKwargs,
	            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
	            **kwargs,
	        )

	        if audio is not None:
	            # ensure we have as much audios as audio tokens
	            num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
	            num_audios = 1 if isinstance(audio, np.ndarray) else len(audio)
	            if num_audio_tokens != num_audios:
	                raise ValueError(
	                    f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}"
	                )

	            # Some kwargs should not be changed so we can expand text with audio tokens below
	            audio_kwargs = output_kwargs["audio_kwargs"]
	            audio_kwargs["return_attention_mask"] = True
	            audio_kwargs["padding"] = "max_length"
	            audio_inputs = self.feature_extractor(audio, **audio_kwargs)

	            # rename attention_mask to prevent conflicts later on
	            audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")

	            # One length per audio: the number of attended positions in its feature attention mask.
	            audio_lengths = audio_inputs["feature_attention_mask"].sum(-1).tolist()
	            text = self._expand_audio_tokens(text, audio_lengths)

	        # `return_tensors` is applied once, on the merged output, rather than by the tokenizer.
	        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
	        inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

	        if audio is not None:
	            inputs.update(audio_inputs)

	        return BatchFeature(data={**inputs}, tensor_type=return_tensors)

	    def _expand_audio_tokens(self, text, audio_lengths):
	        """
	        Replace every audio token in `text` with its expanded form.

	        Audio tokens are consumed left to right across all samples, and the i-th token uses `audio_lengths[i]`.
	        Each token is repeated `(audio_length - 1) // 2 + 1` times (presumably the audio encoder's output
	        length; confirm against the model). The repeated token is wrapped in the audio bos/eos tokens only
	        when the original token has neither of them directly around it.

	        The text is scanned once and rebuilt from slices, so literal text that looks like an internal marker
	        (for instance `<placeholder>`) is left untouched.

	        Args:
	            text (`list[str]`): the samples to expand.
	            audio_lengths (`list[int]`): one length per audio, in order of appearance.

	        Returns:
	            `list[str]`: the samples with every audio token expanded.
	        """
	        token = self.audio_token
	        bos = self.audio_bos_token
	        eos = self.audio_eos_token

	        expanded_text = []
	        audio_idx = 0
	        for sample in text:
	            pieces = []
	            cursor = 0
	            start = sample.find(token, cursor)
	            while start != -1:
	                end = start + len(token)

	                audio_length = audio_lengths[audio_idx]
	                audio_idx += 1
	                num_audio_tokens = (audio_length - 1) // 2 + 1
	                expanded_audio_token = token * num_audio_tokens

	                # Check if this audio token is surrounded by bos/eos tokens
	                has_bos = sample.endswith(bos, 0, start)
	                has_eos = sample.startswith(eos, end)
	                if not has_bos and not has_eos:
	                    expanded_audio_token = bos + expanded_audio_token + eos

	                pieces.append(sample[cursor:start])
	                pieces.append(expanded_audio_token)
	                cursor = end
	                start = sample.find(token, cursor)

	            pieces.append(sample[cursor:])
	            expanded_text.append("".join(pieces))
	        return expanded_text

	    def batch_decode(self, *args, **kwargs):
	        """
	        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
	        refer to the docstring of this method for more information.
	        """
	        return self.tokenizer.batch_decode(*args, **kwargs)

	    def decode(self, *args, **kwargs):
	        """
	        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
	        the docstring of this method for more information.
	        """
	        return self.tokenizer.decode(*args, **kwargs)

	    @property
	    def model_input_names(self):
	        """Tokenizer and feature extractor input names plus `feature_attention_mask`, without duplicates."""
	        tokenizer_input_names = self.tokenizer.model_input_names
	        feature_extractor_input_names = self.feature_extractor.model_input_names
	        # dict.fromkeys removes duplicates while keeping first-seen order.
	        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names + ["feature_attention_mask"]))

	    # NOTE: we don't have default templates anymore, and the below is kept only because the hub config is not yet updated!
	    @property
	    def default_chat_template(self):
	        """
	        This default vicuna template formats inputs in the form of a chat history. For each message in the chat history:
	        * the template will output the role of the speaker followed by the content of the message.
	        * content is a list of strings and audios.
	        * If the content element is an audio, the template will output a sequence of <|AUDIO|> tokens

	        Example:

	        ```python
	        messages = [
	            {'role': 'system', 'content': 'You are a helpful assistant.'},
	            {"role": "user", "content": [
	                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
	                {"type": "text", "text": "What's that sound?"},
	            ]},
	            {"role": "assistant", "content": "It is the sound of glass shattering."},
	            {"role": "user", "content": [
	                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
	                {"type": "text", "text": "How about this one?"},
	            ]},
	        ]

	        result = template.render(messages=messages, add_generation_prompt=True)
	        ```
	        """
	        # fmt: off
	        return (
	            "{% set audio_count = namespace(value=0) %}"
	            "{% for message in messages %}"
	                "{% if loop.first and message['role'] != 'system' %}"
	                    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
	                "{% endif %}"
	                "<|im_start|>{{ message['role'] }}\n"
	                "{% if message['content'] is string %}"
	                    "{{ message['content'] }}<|im_end|>\n"
	                "{% else %}"
	                    "{% for content in message['content'] %}"
	                        "{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %}"
	                            "{% set audio_count.value = audio_count.value + 1 %}"
	                            "Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
	                        "{% elif 'text' in content %}"
	                            "{{ content['text'] }}"
	                        "{% endif %}"
	                    "{% endfor %}"
	                    "<|im_end|>\n"
	                "{% endif %}"
	            "{% endfor %}"
	            "{% if add_generation_prompt %}"
	                "<|im_start|>assistant\n"
	            "{% endif %}"
	        )
	        # fmt: on


	# Public API of this module: the only name exported by a star-import.
	__all__ = ["LLMAudioProcessor"]