izhx committed · verified
Commit facd32f · 1 Parent(s): e93daf4

revert gme_inference.py

Files changed (1)
  1. gme_inference.py +108 -121
gme_inference.py CHANGED
@@ -1,70 +1,44 @@
 from __future__ import annotations
 
-import base64
 import logging
 import math
 import os
-from io import BytesIO
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional
 
-import requests
 import torch
 from PIL import Image
 from torch.utils.data import DataLoader
 from tqdm.autonotebook import tqdm
-from transformers import (
-    AutoConfig,
-    AutoModel,
-    AutoModelForVision2Seq,
-    AutoProcessor,
-    PreTrainedModel,
-    Qwen2VLConfig,
-    Qwen2VLForConditionalGeneration,
-)
-import os
-from collections.abc import Iterable
+from transformers import AutoModelForVision2Seq, AutoProcessor
 
 
-class GmeQwen2VLConfig(Qwen2VLConfig):
-    model_type: str = "gme_qwen2_vl"
-
+class GmeQwen2VL:
     def __init__(
         self,
-        min_image_tokens: int = 256,
-        max_image_tokens: int = 1280,
-        max_length: int = 1800,
+        model_name: str = "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
+        model_path: Optional[str] = None,
         device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        **kwargs: Any,
+        min_image_tokens=256,
+        max_image_tokens=1280,
+        max_length=1800,
+        **kwargs,
     ) -> None:
-        super().__init__(**kwargs)
-        self.min_image_tokens = min_image_tokens
-        self.max_image_tokens = max_image_tokens
-        self.max_length = max_length
-
-
-class GmeQwen2VLForVision2Seq(PreTrainedModel):
-    config_class = GmeQwen2VLConfig
-    base_model_prefix: str = "base"
-
-    def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:
-        super().__init__(config)
-        model_name: str = getattr(
-            config, "_name_or_path", "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct"
+        model_name = model_path or model_name
+        self.base = AutoModelForVision2Seq.from_pretrained(
+            model_name, torch_dtype=torch.float16, **kwargs
         )
-
-        self.base = Qwen2VLForConditionalGeneration(config)
-        self.normalize: bool = True
-
-        min_pixels: int = config.min_image_tokens * 28 * 28
-        max_pixels: int = config.max_image_tokens * 28 * 28
-
-        self.max_length: int = config.max_length
+        self.base.eval()
+        self.normalize = True
+        self.device = device
+        min_pixels = min_image_tokens * 28 * 28
+        max_pixels = max_image_tokens * 28 * 28
+        self.max_length = max_length
         self.processor = AutoProcessor.from_pretrained(
             model_name, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
         )
-        self.processor.tokenizer.padding_side = "right"
-        self.defualt_instruction: str = "You are a helpful assistant."
-        self.sep: str = " "
+        self.processor.tokenizer.padding_side = 'right'
+        self.default_instruction = 'You are a helpful assistant.'
+        self.sep = ' '
 
     def forward(
         self,
@@ -78,15 +52,13 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         image_grid_thw: Optional[torch.LongTensor] = None,
         # video_grid_thw: Optional[torch.LongTensor] = None,
         pooling_mask: Optional[torch.LongTensor] = None,
-        **kwargs,
+        **kwargs
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.base.model.embed_tokens(input_ids)
         if pixel_values is not None:
             pixel_values = pixel_values.type(self.base.visual.get_dtype())
-            image_embeds = self.base.visual(
-                pixel_values, grid_thw=image_grid_thw
-            ).to(inputs_embeds.device)
+            image_embeds = self.base.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
             image_mask = input_ids == self.base.config.image_token_id
             inputs_embeds[image_mask] = image_embeds
         # if pixel_values_videos is not None:
@@ -106,44 +78,36 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         )
 
         pooling_mask = attention_mask if pooling_mask is None else pooling_mask
-        left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0]  # TODO
+        left_padding = (pooling_mask[:, -1].sum() == pooling_mask.shape[0])  # TODO
         if left_padding:
             embeddings = outputs.last_hidden_state[:, -1]
         else:
             sequence_lengths = pooling_mask.sum(dim=1) - 1
             batch_size = outputs.last_hidden_state.shape[0]
-            embeddings = outputs.last_hidden_state[
-                torch.arange(batch_size, device=outputs.last_hidden_state.device),
-                sequence_lengths,
-            ]
+            embeddings = outputs.last_hidden_state[torch.arange(
+                batch_size, device=outputs.last_hidden_state.device
+            ), sequence_lengths]
         if self.normalize:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
         return embeddings.contiguous()
 
-    def embed(
-        self,
-        texts: list[str],
-        images: list[Image.Image],
-        is_query=True,
-        instruction=None,
-        **kwargs,
-    ):
+    def embed(self, texts: list[str], images: list[Image.Image], is_query=True, instruction=None, **kwargs):
         self.base.to(self.device)
         # Inputs must be batched
         input_texts, input_images = list(), list()
         for t, i in zip(texts, images):
             if not is_query or instruction is None:
-                instruction = self.defualt_instruction
-            input_str = ""
+                instruction = self.default_instruction
+            input_str = ''
             if i is None:
                 input_images = None  # All examples in the same batch are consistent
             else:
-                input_str += "<|vision_start|><|image_pad|><|vision_end|>"
+                input_str += '<|vision_start|><|image_pad|><|vision_end|>'
                 i = fetch_image(i)
                 input_images.append(i)
             if t is not None:
                 input_str += t
-            msg = f"<|im_start|>system\n{instruction}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
+            msg = f'<|im_start|>system\n{instruction}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>'
             input_texts.append(msg)
 
         inputs = self.processor(
@@ -152,7 +116,7 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
             padding=True,
             truncation=True,
             max_length=self.max_length,
-            return_tensors="pt",
+            return_tensors='pt'
         )
         inputs = {k: v.to(self.device) for k, v in inputs.items()}  # TODO
         with torch.no_grad():
@@ -160,9 +124,7 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         return embeddings
 
     def encode(self, sentences: list[str], *, prompt_name=None, **kwargs):
-        return self.get_fused_embeddings(
-            texts=sentences, prompt_name=prompt_name, **kwargs
-        )
+        return self.get_fused_embeddings(texts=sentences, prompt_name=prompt_name, **kwargs)
 
     def encode_queries(self, queries: List[str], **kwargs):
         embeddings = self.encode(queries, **kwargs)
@@ -178,9 +140,7 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
             ]
         else:
             sentences = [
-                (doc["title"] + self.sep + doc["text"]).strip()
-                if "title" in doc
-                else doc["text"].strip()
+                (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
                 for doc in corpus
             ]
         embeddings = self.encode(sentences, is_query=False, **kwargs)
@@ -192,18 +152,13 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
     def get_text_embeddings(self, texts: list[str], **kwargs):
         return self.get_fused_embeddings(texts=texts, **kwargs)
 
-    def get_fused_embeddings(
-        self,
-        texts: list[str] = None,
-        images: list[Image.Image] | DataLoader = None,
-        **kwargs,
-    ):
+    def get_fused_embeddings(self, texts: list[str] = None, images: list[Image.Image] | DataLoader = None, **kwargs):
         if isinstance(images, DataLoader):
             image_loader = images
             batch_size = image_loader.batch_size
             image_loader.dataset.transform = None
         else:
-            batch_size = kwargs.pop("batch_size", 32)
+            batch_size = kwargs.pop('batch_size', 32)
             if images is None:
                 image_loader = None
             else:
@@ -224,18 +179,10 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
 
         all_embeddings = list()
         none_batch = [None] * batch_size
-        show_progress_bar = kwargs.pop("show_progress_bar", True)
-        pbar = tqdm(
-            total=n_batch,
-            disable=not show_progress_bar,
-            mininterval=1,
-            miniters=10,
-            desc="encode",
-        )
-        for n, img_batch in zip(
-            range(0, n_batch * batch_size, batch_size), image_loader
-        ):
-            text_batch = none_batch if texts is None else texts[n : n + batch_size]
+        show_progress_bar = kwargs.pop('show_progress_bar', True)
+        pbar = tqdm(total=n_batch, disable=not show_progress_bar, mininterval=1, miniters=10, desc='encode')
+        for n, img_batch in zip(range(0, n_batch * batch_size, batch_size), image_loader):
+            text_batch = none_batch if texts is None else texts[n: n+batch_size]
             img_batch = none_batch if img_batch is None else img_batch
             embeddings = self.embed(texts=text_batch, images=img_batch, **kwargs)
             pbar.update(1)
@@ -249,11 +196,15 @@ def custom_collate_fn(batch):
     return batch
 
 
-# Utility functions (copied from your vision processing code)
-IMAGE_FACTOR: int = 28
-MIN_PIXELS: int = 4 * 28 * 28
-MAX_PIXELS: int = 16384 * 28 * 28
-MAX_RATIO: int = 200
+### Copied from qwen_vl_utils.vision_process.py
+import base64
+from io import BytesIO
+import requests
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
 
 
 def round_by_factor(number: int, factor: int) -> int:
@@ -272,17 +223,16 @@ def floor_by_factor(number: int, factor: int) -> int:
 
 
 def smart_resize(
-    height: int,
-    width: int,
-    factor: int = IMAGE_FACTOR,
-    min_pixels: int = MIN_PIXELS,
-    max_pixels: int = MAX_PIXELS,
+    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
 ) -> tuple[int, int]:
     """
-    Rescales the image so that:
-    1. Both dimensions are divisible by 'factor'.
-    2. Total pixels fall between ['min_pixels', 'max_pixels'].
-    3. Aspect ratio is maintained as closely as possible.
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
     """
     h_bar = max(factor, round_by_factor(height, factor))
     w_bar = max(factor, round_by_factor(width, factor))
@@ -306,31 +256,35 @@ def smart_resize(
     return h_bar, w_bar
 
 
-def fetch_image(
-    image: Union[str, Image.Image], size_factor: int = IMAGE_FACTOR
-) -> Image.Image:
-    image_obj: Optional[Image.Image] = None
+def fetch_image(image: str | Image.Image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    image_obj = None
     if isinstance(image, Image.Image):
         image_obj = image
-    elif isinstance(image, str) and (
-        image.startswith("http://") or image.startswith("https://")
-    ):
+    elif image.startswith("http://") or image.startswith("https://"):
        image_obj = Image.open(requests.get(image, stream=True).raw)
-    elif isinstance(image, str) and image.startswith("file://"):
+    elif image.startswith("file://"):
         image_obj = Image.open(image[7:])
-    elif isinstance(image, str) and image.startswith("data:image"):
+    elif image.startswith("data:image"):
        if "base64," in image:
            _, base64_data = image.split("base64,", 1)
            data = base64.b64decode(base64_data)
            image_obj = Image.open(BytesIO(data))
-    elif isinstance(image, str):
+    else:
        image_obj = Image.open(image)
    if image_obj is None:
-        raise ValueError(
-            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
-        )
+        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
    image = image_obj.convert("RGB")
+    ## resize
+    # if "resized_height" in ele and "resized_width" in ele:
+    #     resized_height, resized_width = smart_resize(
+    #         ele["resized_height"],
+    #         ele["resized_width"],
+    #         factor=size_factor,
+    #     )
+    # else:
    width, height = image.size
+    # min_pixels = ele.get("min_pixels", MIN_PIXELS)
+    # max_pixels = ele.get("max_pixels", MAX_PIXELS)
    resized_height, resized_width = smart_resize(
        height,
        width,
@@ -339,4 +293,37 @@ def fetch_image(
         max_pixels=MAX_PIXELS,
     )
     image = image.resize((resized_width, resized_height))
+
     return image
+###
+
+
+if __name__ == '__main__':
+    texts = [
+        "What kind of car is this?",
+        "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023."
+    ]
+    images = [
+        'https://upload.wikimedia.org/wikipedia/commons/e/e9/Tesla_Cybertruck_damaged_window.jpg',
+        'https://upload.wikimedia.org/wikipedia/commons/9/95/2024_Tesla_Cybertruck_Foundation_Series%2C_front_left_%28Greenwich%29.jpg',
+    ]
+
+    gme = GmeQwen2VL("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
+
+    # Single-modal embedding
+    e_text = gme.get_text_embeddings(texts=texts)
+    e_image = gme.get_image_embeddings(images=images)
+    print((e_text * e_image).sum(-1))
+    ## tensor([0.2281, 0.6001], dtype=torch.float16)
+
+    # How to set embedding instruction
+    e_query = gme.get_text_embeddings(texts=texts, instruction='Find an image that matches the given text.')
+    # If is_query=False, we always use the default instruction.
+    e_corpus = gme.get_image_embeddings(images=images, is_query=False)
+    print((e_query * e_corpus).sum(-1))
+    ## tensor([0.2433, 0.7051], dtype=torch.float16)
+
+    # Fused-modal embedding
+    e_fused = gme.get_fused_embeddings(texts=texts, images=images)
+    print((e_fused[0] * e_fused[1]).sum())
+    ## tensor(0.6108, dtype=torch.float16)
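
A minimal retrieval sketch, not part of this commit: it assumes the reverted gme_inference.py is importable as a local module and reuses the GmeQwen2VL API shown above (get_text_embeddings, get_image_embeddings); the query instruction string and the Wikimedia image URLs are just the example values from the file.

# Illustrative sketch (assumption: gme_inference.py is on the Python path).
from gme_inference import GmeQwen2VL

queries = ["What kind of car is this?"]
candidate_images = [
    'https://upload.wikimedia.org/wikipedia/commons/e/e9/Tesla_Cybertruck_damaged_window.jpg',
    'https://upload.wikimedia.org/wikipedia/commons/9/95/2024_Tesla_Cybertruck_Foundation_Series%2C_front_left_%28Greenwich%29.jpg',
]

gme = GmeQwen2VL("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
q = gme.get_text_embeddings(texts=queries, instruction='Find an image that matches the given text.')
d = gme.get_image_embeddings(images=candidate_images, is_query=False)

# Embeddings are L2-normalized in forward(), so the dot product is cosine similarity.
scores = q @ d.T                      # shape: (num_queries, num_candidates)
print(scores, scores.argmax(dim=-1))  # best-matching image index per query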