# audio_only_processor.py

import numpy as np
from typing import List, Optional, Union
from transformers import WhisperFeatureExtractor, Qwen2TokenizerFast
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput
from transformers.feature_extraction_utils import BatchFeature


class AudioOnlyProcessor(ProcessorMixin):
    """
    A processor class for AudioOnlyThinker. Handles only text + audio input (no image/video support).
    """

    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
    model_input_names = ["input_features", "attention_mask", "input_ids", "feature_attention_mask"]

    def __init__(self, feature_extractor=None, tokenizer=None, chat_template=None):
        # Attributes are assigned directly rather than through ProcessorMixin.__init__
        # so the processor can be constructed even when one component is absent.
        self.audio_token = "<|AUDIO|>"
        self.audio_bos_token = "<|audio_bos|>"
        self.audio_eos_token = "<|audio_eos|>"
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.current_processor = self.tokenizer
        self.chat_template = chat_template

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        audios: Union[np.ndarray, List[np.ndarray]],
        sampling_rate: Optional[int] = 16000,
        padding: Union[bool, str, PaddingStrategy] = False,
        **kwargs,
    ) -> BatchFeature:
        if not isinstance(text, list):
            text = [text]

        audios_inputs = self.feature_extractor(
            audios, sampling_rate=sampling_rate, return_attention_mask=True, padding="max_length", **kwargs
        )
        # Rename the audio mask so it does not collide with the text attention_mask.
        audios_inputs["feature_attention_mask"] = audios_inputs.pop("attention_mask")

        # Frames after the encoder's stride-2 conv front-end, then after the 2x pooling
        # (the Qwen2-Audio length formula). np.asarray handles both the feature
        # extractor's default numpy output and CPU torch tensors when
        # return_tensors="pt" is passed through kwargs.
        # Example: 30 s at 16 kHz -> 3000 mel frames -> 1500 conv frames -> 750 audio tokens.
        feature_attention_mask = np.asarray(audios_inputs["feature_attention_mask"])
        input_lengths = (feature_attention_mask.sum(-1) - 1) // 2 + 1
        audio_lengths = (input_lengths - 2) // 2 + 1

        # Expand each <|AUDIO|> token into one placeholder per encoder output frame,
        # consuming one entry of audio_lengths per occurrence across the batch.
        audio_index = 0
        for i in range(len(text)):
            while self.audio_token in text[i]:
                text[i] = text[i].replace(
                    self.audio_token,
                    "<|audio_placeholder|>" * int(audio_lengths[audio_index]),
                    1,
                )
                audio_index += 1
            text[i] = text[i].replace("<|audio_placeholder|>", self.audio_token)

        return_tensors = kwargs.get("return_tensors")
        text_inputs = self.tokenizer(text, padding=padding, return_tensors=return_tensors)

        return BatchFeature(data={**text_inputs, **audios_inputs}, tensor_type=return_tensors)

    def apply_chat_template(self, conversations, chat_template=None, **kwargs):
        if isinstance(conversations[0], dict):
            conversations = [conversations]
        return self.tokenizer.apply_chat_template(conversations, chat_template=chat_template, **kwargs)

    def batch_decode(self, *args, **kwargs):
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        # Load both components directly rather than through ProcessorMixin's
        # auto-mapping, since this processor only ever pairs these two classes.
        tokenizer = Qwen2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        feature_extractor = WhisperFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def save_pretrained(self, save_directory):
        self.tokenizer.save_pretrained(save_directory)
        self.feature_extractor.save_pretrained(save_directory)
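

# --- Usage sketch (illustrative) ---
# A minimal example of driving the processor end to end, assuming a
# Qwen2-Audio-style checkpoint. The checkpoint name, "sample.wav", and the
# librosa dependency below are placeholders, not part of this module.
if __name__ == "__main__":
    import librosa

    processor = AudioOnlyProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")

    # Load a mono waveform at the 16 kHz rate the Whisper front-end expects.
    waveform, sr = librosa.load("sample.wav", sr=16000)
    prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Transcribe the audio above."

    inputs = processor(text=prompt, audios=waveform, sampling_rate=sr, return_tensors="pt")
    print(inputs["input_ids"].shape, inputs["input_features"].shape)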