BabyLM-community
/

babylm-multimodal-baseline-flamingo

+from transformers import ProcessorMixin, AutoProcessor
+from transformers.models.auto.processing_auto import AutoProcessor
+from transformers.processing_utils import ProcessorMixin
+import json
+import os
+class FlamingoProcessor(ProcessorMixin):
+    """
+    Custom processor that combines a tokenizer and feature extractor.
+    """
+    attributes = ["feature_extractor", "tokenizer"]
+    feature_extractor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+    def __call__(self, text=None, images=None, **kwargs):
+        """
+        Main processing method that handles both text and images.
+        Args:
+            text: Text input(s) to tokenize
+            images: Image input(s) to process
+            **kwargs: Additional arguments passed to tokenizer/feature_extractor
+        Returns:
+            Dictionary with processed inputs
+        """
+        if text is None and images is None:
+            raise ValueError("You need to specify either text or images")
+        encoding = {}
+        # Process text if provided
+        if text is not None:
+            if type(text) == str:
+                all_text = "<image> " + text
+            else:
+                all_text = ["<image> " + _text for _text in text]
+            text_encoding = self.tokenizer(all_text, **kwargs)
+            encoding.update(text_encoding)
+        # Process images if provided
+        if images is not None:
+            image_encoding = self.feature_extractor(images, **kwargs)
+            # Add prefix to avoid key conflicts
+            for key, value in image_encoding.items():
+                encoding[f"pixel_values" if key == "pixel_values" else f"image_{key}"] = value
+        return encoding
+    def batch_decode(self, *args, **kwargs):
+        """
+        Delegate batch decoding to the tokenizer.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """
+        Delegate decoding to the tokenizer.
+        """
+        return self.tokenizer.decode(*args, **kwargs)