Automatic Speech Recognition
Transformers
Safetensors
meralion2
meralion
meralion-2
custom_code
YingxuHe committed · Commit 6548ee7 · verified · Parent: 0ace617

Upload processor

preprocessor_config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "auto_map": {
-    "AutoProcessor": "processing_meralion.MERaLiONProcessor"
+    "AutoProcessor": "processing_meralion2.MERaLiON2Processor"
   },
   "chunk_length": 30,
   "dither": 0.0,
@@ -12,7 +12,7 @@
   "nb_max_frames": 3000,
   "padding_side": "right",
   "padding_value": 0.0,
-  "processor_class": "MERaLiONProcessor",
+  "processor_class": "MERaLiON2Processor",
   "return_attention_mask": false,
   "sampling_rate": 16000
 }
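
Because auto_map now points at processing_meralion2.MERaLiON2Processor (custom code shipped in this repo, hence the custom_code tag), the processor has to be loaded with trust_remote_code=True. A minimal loading sketch; the repo id below is a placeholder for the actual model repository:

from transformers import AutoProcessor

# Placeholder repo id -- substitute the real MERaLiON2 checkpoint path.
processor = AutoProcessor.from_pretrained(
    "MERaLiON/MERaLiON2",
    trust_remote_code=True,  # required so auto_map can import processing_meralion2.py
)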
processing_meralion2.py ADDED
@@ -0,0 +1,194 @@
"""Processor class for MERaLiON2."""

from typing import List, Optional, Union

import numpy as np

from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput


# copied from transformers.models.qwen2_audio.processing_qwen2_audio.Qwen2AudioProcessor
class MERaLiON2Processor(ProcessorMixin):
    r"""
    Constructs a MERaLiON2 processor which wraps a Whisper feature extractor and a Gemma tokenizer into a single
    processor.

    [`MERaLiON2Processor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`GemmaTokenizer`]. See
    [`~MERaLiON2Processor.__call__`] and [`~MERaLiON2Processor.decode`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
            The feature extractor is a required input.
        tokenizer ([`GemmaTokenizer`], *optional*):
            The tokenizer is a required input.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the default chat template
            is used.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"
    valid_kwargs = [
        "fixed_speech_embeds_length",
        "speech_token_index",
        "time_duration_limit",
        "whisper_chunk_size",
        "do_normalize",
    ]

    def __init__(
        self,
        feature_extractor=None,
        tokenizer=None,
        fixed_speech_embeds_length=100,
        speech_token_index=255999,
        time_duration_limit=300,
        whisper_chunk_size=30,
        do_normalize=True,
        chat_template=None,
    ):
        self.fixed_speech_embeds_length = fixed_speech_embeds_length
        self.speech_token_index = speech_token_index
        self.whisper_chunk_size = whisper_chunk_size
        # Audio beyond `time_duration_limit` seconds is dropped: at most this many chunks per clip.
        self.number_chunk_limit = time_duration_limit // whisper_chunk_size
        self.do_normalize = do_normalize

        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)

        self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content
        self.feature_chunk_size = self.whisper_chunk_size * self.feature_extractor.sampling_rate

    def _process_text(self, text: List[str], audio_number_chunks: np.ndarray):
        # Expand each speech placeholder into `fixed_speech_embeds_length` copies per audio chunk,
        # reserving one text position for every speech embedding the audio encoder will produce.
        pieces = []
        for i, item in enumerate(text):
            target_string = self.speech_token * self.fixed_speech_embeds_length * audio_number_chunks[i]
            pieces.append(item.replace(self.speech_token, target_string))
        return pieces

    def _get_number_chunks(self, audios: List[np.ndarray]):
        audio_lengths = np.array([audio.shape[0] for audio in audios])
        number_chunks = (audio_lengths // self.feature_chunk_size) + 1
        return np.clip(number_chunks, a_min=None, a_max=self.number_chunk_limit)

    def _get_chunked_audios(self, audios: Union[np.ndarray, List[np.ndarray]]):
        if isinstance(audios, np.ndarray):
            audios = [audios]

        audio_number_chunks = self._get_number_chunks(audios)
        chunked_audios = []

        for audio_idx, audio in enumerate(audios):
            for cid in range(audio_number_chunks[audio_idx]):
                chunked_audios.append(
                    audio[cid * self.feature_chunk_size: (cid + 1) * self.feature_chunk_size]
                )
        return audio_number_chunks, chunked_audios

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        audios: Union[np.ndarray, List[np.ndarray]] = None,
        padding: Union[bool, str, PaddingStrategy] = True,
        sampling_rate: Optional[int] = None,
        do_normalize: Optional[bool] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to prepare one or several text sequence(s) and audio(s) for the model. This method forwards the
        `text` and `kwargs` arguments to GemmaTokenizer's [`~GemmaTokenizer.__call__`] if `text` is not `None` to
        encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
        WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the
        docstring of the above two methods for more information.

        Args:
            text (`str`, `List[str]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audios (`np.ndarray`, `List[np.ndarray]`):
                The audio or batch of audios to be prepared. Each audio can be a NumPy array.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                  lengths).
            sampling_rate (`int`, *optional*, defaults to 16000):
                The sampling rate in hertz (Hz) at which the audio files should be digitized.
            do_normalize (`bool`, *optional*, defaults to `True`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
        """

        if text is None:
            raise ValueError("You need to specify a `text` input to process.")
        if not isinstance(text, list):
            text = [text]
        if sampling_rate is None:
            sampling_rate = self.feature_extractor.sampling_rate
        if do_normalize is None:
            do_normalize = self.do_normalize

        inputs_dict = {}

        if audios is not None:
            if not isinstance(audios, list):
                audios = [audios]

            for i, audio in enumerate(audios):
                if audio.ndim > 1:
                    raise ValueError(
                        f"MERaLiON2 only accepts mono-channel audio, but audio {i + 1} has {audio.ndim} dimensions (expected 1)"
                    )

            audio_number_chunks, chunked_audios = self._get_chunked_audios(audios)
            text = self._process_text(text, audio_number_chunks)

            audio_inputs = self.feature_extractor(
                chunked_audios,
                sampling_rate=sampling_rate,
                return_tensors="pt",
                return_attention_mask=True,
                padding="max_length",
                do_normalize=do_normalize,
                **kwargs,
            )
            # Rename attention_mask to prevent conflicts with the text attention mask later on.
            audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")
            inputs_dict.update(audio_inputs)

        text_input = self.tokenizer(
            text=text,
            return_tensors="pt",
            add_special_tokens=False,
            return_attention_mask=True,
            padding=padding,
            **kwargs,
        )

        inputs_dict["input_ids"] = text_input.input_ids
        inputs_dict["attention_mask"] = text_input.attention_mask

        return BatchFeature(data={**inputs_dict})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to GemmaTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to GemmaTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names + ["feature_attention_mask"]))
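
For reference, a minimal call sketch continuing from the processor loaded above. The prompt wording is illustrative and assumed; the only requirement __call__ imposes is that the speech placeholder (exposed as processor.speech_token) appears in the text wherever audio embeddings should be spliced in:

import numpy as np

# 10 seconds of synthetic mono 16 kHz audio, standing in for a real waveform.
audio = np.random.randn(16000 * 10).astype(np.float32)

prompt = f"Transcribe this audio: {processor.speech_token}"
inputs = processor(text=prompt, audios=audio)

# A 10 s clip yields 1 chunk, so 100 speech-token positions in input_ids, plus
# "input_features" and "feature_attention_mask" from the Whisper feature extractor.
print(inputs["input_ids"].shape, inputs["input_features"].shape)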
processor_config.json CHANGED
@@ -1,10 +1,10 @@
 {
   "auto_map": {
-    "AutoProcessor": "processing_meralion.MERaLiONProcessor"
+    "AutoProcessor": "processing_meralion2.MERaLiON2Processor"
   },
   "do_normalize": true,
   "fixed_speech_embeds_length": 100,
-  "processor_class": "MERaLiONProcessor",
+  "processor_class": "MERaLiON2Processor",
   "speech_token_index": 255999,
   "whisper_chunk_size": 30
 }
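
These values feed the chunking arithmetic in MERaLiON2Processor.__init__ and _get_number_chunks. A worked sketch with the defaults (time_duration_limit=300 comes from the __init__ default; it is not stored in this config):

sampling_rate = 16000             # from preprocessor_config.json
whisper_chunk_size = 30           # seconds of audio per Whisper chunk
time_duration_limit = 300         # __init__ default, so number_chunk_limit = 300 // 30 = 10
fixed_speech_embeds_length = 100  # speech-token positions reserved per chunk

def speech_positions(num_samples: int) -> int:
    # Mirrors _get_number_chunks: floor-divide by the chunk size in samples,
    # add one, then clip at the chunk limit.
    chunk_samples = whisper_chunk_size * sampling_rate  # 480_000 samples
    num_chunks = min(num_samples // chunk_samples + 1, time_duration_limit // whisper_chunk_size)
    return num_chunks * fixed_speech_embeds_length

print(speech_positions(16000 * 45))   # 45 s   -> 2 chunks           -> 200 positions
print(speech_positions(16000 * 600))  # 10 min -> clipped to 10 chunks -> 1000 positions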
tokenizer_config.json CHANGED
@@ -2000,7 +2000,7 @@
     "<end_of_turn>"
   ],
   "auto_map": {
-    "AutoProcessor": "processing_meralion.MERaLiONProcessor"
+    "AutoProcessor": "processing_meralion2.MERaLiON2Processor"
   },
   "bos_token": "<bos>",
   "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
@@ -2010,7 +2010,7 @@
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
   "padding_side": "left",
-  "processor_class": "MERaLiONProcessor",
+  "processor_class": "MERaLiON2Processor",
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
   "tokenizer_class": "GemmaTokenizer",