Upload processor

Browse files

Files changed (6) hide show

chat_template.json +3 -0
llm_audio_processing.py +252 -0
preprocessor_config.json +4 -1
processor_config.json +9 -0
tokenizer.json +2 -2
tokenizer_config.json +4 -0

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

llm_audio_processing.py ADDED Viewed

	@@ -0,0 +1,252 @@

+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Qwen2Audio.
+"""
+import warnings
+from typing import Union
+import numpy as np
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils.deprecation import deprecate_kwarg
+class LLMAudioProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "audio_kwargs": {},
+    }
+class LLMAudioProcessor(ProcessorMixin):
+    r"""
+    Constructs a Qwen2Audio processor which wraps a Qwen2Audio feature extractor and a Qwen2Audio tokenizer into a single processor.
+    [`Qwen2AudioProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`Qwen2TokenizerFast`]. See the
+    [`~Qwen2AudioProcessor.__call__`] and [`~Qwen2AudioProcessor.decode`] for more information.
+    Args:
+        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
+            The feature extractor is a required input.
+        tokenizer ([`Qwen2TokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`Optional[str]`, *optional*):
+                The Jinja template to use for formatting the conversation. If not provided, the default chat template
+                is used.
+        audio_token (`str`, *optional*, defaults to `"<|AUDIO|>"`):
+            The token to use for audio tokens.
+        audio_bos_token (`str`, *optional*, defaults to `"<|audio_bos|>"`):
+            The token to use for audio bos tokens.
+        audio_eos_token (`str`, *optional*, defaults to `"<|audio_eos|>"`):
+            The token to use for audio eos tokens.
+    """
+    attributes = ["feature_extractor", "tokenizer"]
+    feature_extractor_class = "WhisperFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        feature_extractor=None,
+        tokenizer=None,
+        chat_template=None,
+        audio_token="<|AUDIO|>",
+        audio_bos_token="<|audio_bos|>",
+        audio_eos_token="<|audio_eos|>",
+    ):
+        if chat_template is None:
+            chat_template = self.default_chat_template
+        self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
+        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
+        self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token
+        self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token
+        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
+    @deprecate_kwarg("audios", version="4.54.0", new_name="audio")
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio: Union[np.ndarray, list[np.ndarray]] = None,
+        audios=None,  # kept for BC
+        **kwargs: Unpack[LLMAudioProcessorKwargs],
+    ) -> BatchFeature:
+        # Handle BC when user passes deprecated keyword argument
+        if audios is not None and audio is None:
+            audio = audios
+            warnings.warn(
+                "You may have used the keyword argument for the `audio` inputs. It is strongly recommended to pass inputs with keyword arguments "
+                "with keys `audio` and `text`. From transformers v4.55 `audio` will be the only acceptable keyword argument.",
+                FutureWarning,
+            )
+        if text is None:
+            raise ValueError("You need to specify `text` input to process.")
+        elif isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+        output_kwargs = self._merge_kwargs(
+            LLMAudioProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if audio is not None:
+            # ensure we have as much audios as audio tokens
+            num_audio_tokens = sum(sample.count(self.audio_token) for sample in text)
+            num_audios = 1 if type(audio) is np.ndarray else len(audio)
+            if num_audio_tokens != num_audios:
+                raise ValueError(
+                    f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}"
+                )
+            # Some kwargs should not be changed so we can expand text with audio tokens below
+            output_kwargs["audio_kwargs"]["return_attention_mask"] = True
+            output_kwargs["audio_kwargs"]["padding"] = "max_length"
+            audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])
+            # rename attention_mask to prevent conflicts later on
+            audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")
+            expanded_text = []
+            audio_lengths = audio_inputs["feature_attention_mask"].sum(-1).tolist()
+            for sample in text:
+                replace_str = []
+                while self.audio_token in sample:
+                    audio_length = audio_lengths.pop(0)
+                    input_length = (audio_length - 1) // 2 + 1
+                    num_audio_tokens = input_length
+                    expanded_audio_token = self.audio_token * num_audio_tokens
+                    audio_token_start_idx = sample.find(self.audio_token)
+                    audio_token_end_idx = audio_token_start_idx + len(self.audio_token)
+                    has_bos = (
+                        sample[audio_token_start_idx - len(self.audio_bos_token) : audio_token_start_idx]
+                        == self.audio_bos_token
+                    )
+                    has_eos = (
+                        sample[audio_token_end_idx : audio_token_end_idx + len(self.audio_eos_token)]
+                        == self.audio_eos_token
+                    )
+                    # Check if this audio token is surrounded by bos/eos tokens
+                    if not has_bos and not has_eos:
+                        expanded_audio_token = self.audio_bos_token + expanded_audio_token + self.audio_eos_token
+                    replace_str.append(expanded_audio_token)
+                    sample = sample.replace(self.audio_token, "<placeholder>", 1)
+                while "<placeholder>" in sample:
+                    sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
+                expanded_text.append(sample)
+            text = expanded_text
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        if audio is not None:
+            inputs.update(audio_inputs)
+        return BatchFeature(data={**inputs}, tensor_type=return_tensors)
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        feature_extractor_input_names = self.feature_extractor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names + ["feature_attention_mask"]))
+    @property
+    # NOTE: we don't have default templates anymore, and the below is kept only because the hub config is not yet updated!
+    def default_chat_template(self):
+        """
+        This default vicuna template formats inputs in the form of a chat history. For each message in the chat history:
+        * the template will output the role of the speaker followed by the content of the message.
+        * content is a list of strings and audios.
+        * If the content element is an audio, the template will output a sequence of <|AUDIO|> tokens
+        Example:
+        ```python
+        messages = [
+            {'role': 'system', 'content': 'You are a helpful assistant.'},
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
+                {"type": "text", "text": "What's that sound?"},
+            ]},
+            {"role": "assistant", "content": "It is the sound of glass shattering."},
+            {"role": "user", "content": [
+                {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
+                {"type": "text", "text": "How about this one?"},
+            ]},
+        ]
+        result = template.render(messages=messages, add_generation_prompt=True)
+        ```
+        """
+        # fmt: off
+        return (
+            "{% set audio_count = namespace(value=0) %}"
+            "{% for message in messages %}"
+                "{% if loop.first and message['role'] != 'system' %}"
+                    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+                "{% endif %}"
+                "<|im_start|>{{ message['role'] }}\n"
+                "{% if message['content'] is string %}"
+                    "{{ message['content'] }}<|im_end|>\n"
+                "{% else %}"
+                    "{% for content in message['content'] %}"
+                        "{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' or content['type'] == 'audio' %}"
+                            "{% set audio_count.value = audio_count.value + 1 %}"
+                            "Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+                        "{% elif 'text' in content %}"
+                            "{{ content['text'] }}"
+                        "{% endif %}"
+                    "{% endfor %}"
+                    "<|im_end|>\n"
+                "{% endif %}"
+            "{% endfor %}"
+            "{% if add_generation_prompt %}"
+                "<|im_start|>assistant\n"
+            "{% endif %}"
+        )
+        # fmt: on
+__all__ = ["LLMAudioProcessor"]

preprocessor_config.json CHANGED Viewed

@@ -1,4 +1,7 @@
 {
   "chunk_length": 30,
   "dither": 0.0,
   "feature_extractor_type": "WhisperFeatureExtractor",
@@ -9,7 +12,7 @@
   "nb_max_frames": 3000,
   "padding_side": "right",
   "padding_value": 0.0,
-  "processor_class": "WhisperProcessor",
   "return_attention_mask": false,
   "sampling_rate": 16000
 }

 {
+  "auto_map": {
+    "AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
+  },
   "chunk_length": 30,
   "dither": 0.0,
   "feature_extractor_type": "WhisperFeatureExtractor",
   "nb_max_frames": 3000,
   "padding_side": "right",
   "padding_value": 0.0,
+  "processor_class": "LLMAudioProcessor",
   "return_attention_mask": false,
   "sampling_rate": 16000
 }

processor_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "audio_bos_token": "<|audio_bos|>",
+  "audio_eos_token": "<|audio_eos|>",
+  "audio_token": "<|AUDIO|>",
+  "auto_map": {
+    "AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
+  },
+  "processor_class": "LLMAudioProcessor"
+}

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7bb5367c34e9391d39dc1028ea4347622217299c54632a9840a91c1200fa3172
-size 11422462

 version https://git-lfs.github.com/spec/v1
+oid sha256:99474fa95a1d2494b675ea499a6a15c177217449a4fe34f52ee130bfee7b723d
+size 11422630

tokenizer_config.json CHANGED Viewed

@@ -218,6 +218,9 @@
     "<|image_pad|>",
     "<|video_pad|>"
   ],
   "bos_token": null,
   "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
   "clean_up_tokenization_spaces": false,
@@ -226,6 +229,7 @@
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

     "<|image_pad|>",
     "<|video_pad|>"
   ],
+  "auto_map": {
+    "AutoProcessor": "llm_audio_processing.LLMAudioProcessor"
+  },
   "bos_token": null,
   "chat_template": "{% set audio_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if 'audio' in content or 'audio_url' in content or message['type'] == 'audio' %}{% set audio_count.value = audio_count.value + 1 %}Audio {{ audio_count.value }}: <|audio_bos|><|AUDIO|><|audio_eos|>\n{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
   "clean_up_tokenization_spaces": false,
   "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
+  "processor_class": "LLMAudioProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null