File size: 3,059 Bytes
40d80f1
 
 
41ad712
40d80f1
 
 
 
 
 
 
41ad712
 
40d80f1
 
41ad712
 
40d80f1
 
 
 
 
 
 
 
41ad712
40d80f1
 
 
 
 
 
 
 
 
 
 
 
 
 
b6039bb
 
 
 
40d80f1
0412630
 
 
 
 
 
 
 
 
 
 
 
 
 
40d80f1
 
 
 
41ad712
40d80f1
 
 
 
41ad712
40d80f1
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from transformers import ProcessorMixin, AutoProcessor
from transformers.models.auto.processing_auto import AutoProcessor
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import BatchEncoding
import json
import os

class FlamingoProcessor(ProcessorMixin):
    """
    Custom processor that combines an image processor and a tokenizer.

    Every text input is prefixed with an ``<image>`` placeholder token
    before tokenization so the downstream model can align image features
    with the text stream.
    """
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, images=None, **kwargs):
        """
        Process text and/or images into model-ready inputs.

        Args:
            text: A string, a list of strings, or a list of lists of
                strings. Inner lists are joined with single spaces; every
                item is then prefixed with "<image> " before tokenization.
            images: Image input(s) forwarded to the image processor.
            **kwargs: Extra arguments passed to both the tokenizer and the
                image processor.

        Returns:
            BatchEncoding containing the tokenizer outputs plus the image
            processor outputs ("pixel_values" kept under its conventional
            name, any other image keys prefixed with "image_").

        Raises:
            ValueError: If both ``text`` and ``images`` are None.
        """
        if text is None and images is None:
            raise ValueError("You need to specify either text or images")

        encoding = {}

        # Process text if provided.
        if text is not None:
            # isinstance instead of type() == comparisons: also accepts
            # str subclasses and is the idiomatic type check.
            if isinstance(text, str):
                all_text = "<image> " + text
            elif isinstance(text[0], str):
                all_text = ["<image> " + _text for _text in text]
            else:
                all_text = ["<image> " + " ".join(_text) for _text in text]
            text_encoding = self.tokenizer(all_text, **kwargs)

            if 'offset_mapping' in text_encoding:
                # Shift character offsets so they are relative to the
                # original text, i.e. without the "<image> " prefix.
                # NOTE(review): this assumes a single (non-batched) mapping;
                # a fast-tokenizer tensor is unwrapped to its first row only.
                # Confirm against callers before relying on batched offsets.
                offset_mapping = text_encoding['offset_mapping']
                if not isinstance(offset_mapping, list):
                    offset_mapping = offset_mapping[0].tolist()
                # End offset of the first token == length of the prefix.
                true_offset = offset_mapping[0][-1]
                new_offsets = []
                for start, end in offset_mapping:
                    if start == 0:
                        # Prefix/special tokens map to an empty span.
                        new_offsets.append((0, 0))
                    else:
                        new_offsets.append((start - true_offset, end - true_offset))
                text_encoding['offset_mapping'] = new_offsets

            encoding.update(text_encoding)

        # Process images if provided.
        if images is not None:
            image_encoding = self.image_processor(images, **kwargs)
            for key, value in image_encoding.items():
                # Keep the conventional "pixel_values" name models expect;
                # prefix everything else to avoid clashing with text keys.
                out_key = key if key == "pixel_values" else f"image_{key}"
                encoding[out_key] = value

        return BatchEncoding(encoding)

    def batch_decode(self, *args, **kwargs):
        """
        Delegate batch decoding to the underlying tokenizer.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Delegate decoding to the underlying tokenizer.
        """
        return self.tokenizer.decode(*args, **kwargs)