"""Processor that pairs an image processor with a tokenizer for Flamingo-style models."""

import json
import os

# NOTE(review): the original imported ProcessorMixin and AutoProcessor twice
# (top-level and submodule paths, the latter shadowing the former). Deduplicated
# to the canonical submodule locations; `json`/`os` kept although unused here.
from transformers.models.auto.processing_auto import AutoProcessor
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import BatchEncoding


class FlamingoProcessor(ProcessorMixin):
    """
    Custom processor that combines a tokenizer and feature extractor.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor, tokenizer):
        """Store the two sub-processors via the standard ProcessorMixin wiring."""
        super().__init__(image_processor, tokenizer)

    def __call__(self, text=None, images=None, **kwargs):
        """
        Main processing method that handles both text and images.

        Args:
            text: Text input(s) to tokenize. May be a single string, a list of
                strings, or a list of lists of strings (joined with spaces).
            images: Image input(s) to process.
            **kwargs: Additional arguments passed to tokenizer/image_processor.

        Returns:
            BatchEncoding with the tokenizer outputs merged with the image
            processor outputs (image keys other than "pixel_values" are
            prefixed with "image_" to avoid collisions).

        Raises:
            ValueError: If neither `text` nor `images` is provided.
        """
        if text is None and images is None:
            raise ValueError("You need to specify either text or images")

        encoding = {}

        if text is not None:
            # A leading space is prepended to every input; offsets are shifted
            # back below so they still refer to the caller's original text.
            if isinstance(text, str):
                all_text = " " + text
            elif isinstance(text[0], str):
                all_text = [" " + _text for _text in text]
            else:
                # List of token lists: join into a single space-separated string.
                all_text = [" " + " ".join(_text) for _text in text]

            text_encoding = self.tokenizer(all_text, **kwargs)

            if "offset_mapping" in text_encoding:
                offset_mapping = text_encoding["offset_mapping"]
                if not isinstance(offset_mapping, list):
                    # Presumably a tensor from return_tensors=...; take the
                    # first (only) sequence. NOTE(review): this — and the flat
                    # (start, end) iteration below — assumes a single sequence,
                    # not a batch; confirm against callers.
                    offset_mapping = offset_mapping[0].tolist()
                # End offset of the first token = width of the prepended prefix.
                true_offset = offset_mapping[0][-1]
                new_offsets = []
                for start, end in offset_mapping:
                    if start == 0:
                        # Special/prefix tokens keep a (0, 0) sentinel offset.
                        new_offsets.append((0, 0))
                    else:
                        new_offsets.append((start - true_offset, end - true_offset))
                text_encoding["offset_mapping"] = new_offsets

            encoding.update(text_encoding)

        if images is not None:
            image_encoding = self.image_processor(images, **kwargs)
            # Keep "pixel_values" under its canonical name; prefix everything
            # else so image keys cannot clobber tokenizer keys.
            for key, value in image_encoding.items():
                out_key = key if key == "pixel_values" else f"image_{key}"
                encoding[out_key] = value

        return BatchEncoding(encoding)

    def batch_decode(self, *args, **kwargs):
        """
        Delegate batch decoding to the tokenizer.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Delegate decoding to the tokenizer.
        """
        return self.tokenizer.decode(*args, **kwargs)