kcz358 committed
Commit 925ae97 · verified · 1 Parent(s): 36071d9

Upload processing_aero.py with huggingface_hub

Files changed (1)
  1. processing_aero.py +235 -0
processing_aero.py ADDED
@@ -0,0 +1,235 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for Aero.
"""

from typing import List, Optional, Union

import numpy as np
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput, VideoInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils import logging

logger = logging.get_logger(__name__)


class AeroProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "audio_kwargs": {},
    }


class AeroProcessor(ProcessorMixin):
    attributes = ["tokenizer", "audio_processor"]
    valid_kwargs = [
        "chat_template",
        "audio_token",
    ]
    tokenizer_class = "AutoTokenizer"
    audio_processor_class = "AutoFeatureExtractor"

    def __init__(
        self,
        tokenizer=None,
        audio_processor=None,
        chat_template=None,
        audio_token="<|AUDIO|>",
        **kwargs,
    ):
        self.audio_token = (
            tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
        )
        if chat_template is None:
            chat_template = self.default_chat_template
        super().__init__(
            tokenizer,
            audio_processor,
            chat_template=chat_template,
        )

    def __call__(
        self,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        audios: Union[np.ndarray, List[np.ndarray]] = None,
        videos: VideoInput = None,
        images: ImageInput = None,
        sampling_rate: Optional[int] = None,
        **kwargs: Unpack[AeroProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare one or several text sequence(s) and audio clip(s) for the model. This method forwards
        the `text` and `kwargs` arguments to the tokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`
        to encode the text. To prepare the audio(s), it forwards the `audios`, `sampling_rate` and `kwargs` arguments
        to the audio processor's `__call__` if `audios` is not `None`. Please refer to the docstring of the above two
        methods for more information. The `images` and `videos` arguments are accepted for API compatibility but are
        not used by this processor.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audios (`np.ndarray`, `List[np.ndarray]`):
                The audio or batch of audios to be prepared. Each audio is a 1D NumPy array of raw waveform samples.
            sampling_rate (`int`, *optional*):
                The sampling rate of the `audios`, forwarded to the audio processor.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **audio_values** -- Audio features to be fed to the model. Returned when `audios` is not `None`.
            - **audio_attention_mask** -- Attention mask over the audio feature frames. Returned when `audios` is not
              `None`.
        """

        output_kwargs = self._merge_kwargs(
            AeroProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise ValueError(
                "Invalid input text. Please provide a string, or a list of strings"
            )

        audio_inputs = {}

        if audios is not None:
            audio_inputs = self.audio_processor(
                audios,
                sampling_rate=sampling_rate,
                return_attention_mask=True,
                padding="max_length",
                **output_kwargs["audio_kwargs"],
            )
            # Rename attention_mask to prevent conflicts with the text attention_mask later on
            audio_inputs["audio_attention_mask"] = audio_inputs.pop("attention_mask")
            # Rename input_features to audio_values for clarity
            audio_inputs["audio_values"] = audio_inputs.pop("input_features")
            # Compute the output length of the convolutional layers and of the audio encoder
            input_lengths = (audio_inputs["audio_attention_mask"].sum(-1) - 1) // 2 + 1
            num_audio_tokens = (input_lengths - 2) // 2 + 1
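            # A worked example (assuming a Whisper-style feature extractor:
            # 16 kHz input, 100 mel frames per second, padded to 30 s):
            # a 10 s clip gives 1000 valid mel frames, so
            #     input_lengths    = (1000 - 1) // 2 + 1 = 500
            #     num_audio_tokens = (500 - 2) // 2 + 1  = 250
            # i.e. that clip's <|AUDIO|> marker is expanded to 250 tokens below.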
            text = self.expand_audio_tokens(text, num_audio_tokens, self.audio_token)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        return BatchFeature(data={**text_inputs, **audio_inputs})

    def expand_audio_tokens(
        self,
        text: List[TextInput],
        num_audio_tokens: List[int],
        special_token: str,
    ):
        """
        Replace each occurrence of `special_token` in `text` with `num_audio_tokens[i]`
        copies of itself, where `i` indexes audio clips in order of appearance. A temporary
        `<placeholder>` marker keeps the `while` loop from re-matching tokens that have
        already been expanded.
        """
        prompt_strings = []
        current_audio_idx = 0
        for sample in text:
            while special_token in sample:
                num_audio_token = num_audio_tokens[current_audio_idx]
                sample = sample.replace(
                    special_token, "<placeholder>" * num_audio_token, 1
                )
                current_audio_idx += 1
            prompt_strings.append(sample)
        text = [
            sample.replace("<placeholder>", special_token) for sample in prompt_strings
        ]
        return text

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def batch_encode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_encode_plus`].
        Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_encode_plus(*args, **kwargs)

    def encode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.encode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.encode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        audio_processor_input_names = self.audio_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + audio_processor_input_names))

    @property
    def default_chat_template(self):
        # fmt: off
        return (
            "{% set audio_count = namespace(value=0) %}"
            "{% for message in messages %}"
            "{% if loop.first and message['role'] != 'system' %}"
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            "{% endif %}"
            "<|im_start|>{{ message['role'] }}\n"
            "{% if message['content'] is string %}"
            "{{ message['content'] }}<|im_end|>\n"
            "{% else %}"
            "{% for content in message['content'] %}"
            "{% if 'audio' in content or 'audio_url' in content %}"
            "{% set audio_count.value = audio_count.value + 1 %}"
            "<|AUDIO|>\n"
            "{% elif 'text' in content %}"
            "{{ content['text'] }}"
            "{% endif %}"
            "{% endfor %}"
            "<|im_end|>\n"
            "{% endif %}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            "<|im_start|>assistant\n"
            "{% endif %}"
        )
        # fmt: on
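
For reference, the default chat template above renders a single-turn audio question into the following ChatML-style prompt (derived by hand from the template; the single <|AUDIO|> marker is what expand_audio_tokens later widens to the clip's token count):

    <|im_start|>system
    You are a helpful assistant.<|im_end|>
    <|im_start|>user
    <|AUDIO|>
    What is said in this clip?<|im_end|>
    <|im_start|>assistant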
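
A minimal end-to-end usage sketch. The repo id, file name, and waveform below are illustrative assumptions, not part of this commit; any checkpoint that ships this processor class should behave the same way:

    import numpy as np
    from transformers import AutoProcessor

    # Hypothetical repo id; substitute the repository this file was uploaded to.
    processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio", trust_remote_code=True)

    messages = [
        {
            "role": "user",
            "content": [
                # An entry with an 'audio' or 'audio_url' key renders as <|AUDIO|>.
                {"type": "audio_url", "audio_url": "clip.wav"},
                {"type": "text", "text": "What is said in this clip?"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    audio = np.zeros(16000, dtype=np.float32)  # stand-in waveform: 1 s of silence at 16 kHz
    inputs = processor(text=prompt, audios=audio, sampling_rate=16000, return_tensors="pt")
    print(sorted(inputs.keys()))
    # ['attention_mask', 'audio_attention_mask', 'audio_values', 'input_ids']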