Alibaba-NLP
/

gme-Qwen2-VL-2B-Instruct

@@ -24,9 +24,10 @@ from transformers import (
 import os
 from collections.abc import Iterable
 class GmeQwen2VLConfig(Qwen2VLConfig):
     model_type: str = "gme_qwen2_vl"
     def __init__(
         self,
         min_image_tokens: int = 256,
@@ -44,25 +45,27 @@ class GmeQwen2VLConfig(Qwen2VLConfig):
 class GmeQwen2VLForVision2Seq(PreTrainedModel):
     config_class = GmeQwen2VLConfig
     base_model_prefix: str = "base"
     def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:
         super().__init__(config)
-        model_name: str = getattr(config, "_name_or_path", "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
         self.base = Qwen2VLForConditionalGeneration(config)
         self.normalize: bool = True
         min_pixels: int = config.min_image_tokens * 28 * 28
         max_pixels: int = config.max_image_tokens * 28 * 28
         self.max_length: int = config.max_length
         self.processor = AutoProcessor.from_pretrained(
             model_name, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
         )
-        self.processor.tokenizer.padding_side = 'right'
         self.defualt_instruction: str = "You are a helpful assistant."
         self.sep: str = " "
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -75,13 +78,15 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         image_grid_thw: Optional[torch.LongTensor] = None,
         # video_grid_thw: Optional[torch.LongTensor] = None,
         pooling_mask: Optional[torch.LongTensor] = None,
-        **kwargs
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.base.model.embed_tokens(input_ids)
             if pixel_values is not None:
                 pixel_values = pixel_values.type(self.base.visual.get_dtype())
-                image_embeds = self.base.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
                 image_mask = input_ids == self.base.config.image_token_id
                 inputs_embeds[image_mask] = image_embeds
             # if pixel_values_videos is not None:
@@ -101,37 +106,44 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         )
         pooling_mask = attention_mask if pooling_mask is None else pooling_mask
-        left_padding = (pooling_mask[:, -1].sum() == pooling_mask.shape[0])  # TODO
         if left_padding:
             embeddings = outputs.last_hidden_state[:, -1]
         else:
             sequence_lengths = pooling_mask.sum(dim=1) - 1
             batch_size = outputs.last_hidden_state.shape[0]
-            embeddings = outputs.last_hidden_state[torch.arange(
-                batch_size, device=outputs.last_hidden_state.device
-            ), sequence_lengths]
         if self.normalize:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
         return embeddings.contiguous()
-    def embed(self, texts: list[str], images: list[Image.Image], is_query=True, instruction=None, **kwargs):
         self.base.to(self.device)
         # Inputs must be batched
         input_texts, input_images = list(), list()
         for t, i in zip(texts, images):
             if not is_query or instruction is None:
                 instruction = self.defualt_instruction
-            input_str = ''
             if i is None:
                 input_images = None  # All examples in the same batch are consistent
             else:
-                input_str += '<|vision_start|><|image_pad|><|vision_end|>'
                 i = fetch_image(i)
                 input_images.append(i)
             if t is not None:
                 input_str += t
-            msg = f'<|im_start|>system\n{instruction}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>'
             input_texts.append(msg)
         inputs = self.processor(
@@ -140,7 +152,7 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
             padding=True,
             truncation=True,
             max_length=self.max_length,
-            return_tensors='pt'
         )
         inputs = {k: v.to(self.device) for k, v in inputs.items()}  # TODO
         with torch.no_grad():
@@ -148,7 +160,9 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         return embeddings
     def encode(self, sentences: list[str], *, prompt_name=None, **kwargs):
-        return self.get_fused_embeddings(texts=sentences, prompt_name=prompt_name, **kwargs)
     def encode_queries(self, queries: List[str], **kwargs):
         embeddings = self.encode(queries, **kwargs)
@@ -164,7 +178,9 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
             ]
         else:
             sentences = [
-                (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
                 for doc in corpus
             ]
         embeddings = self.encode(sentences, is_query=False, **kwargs)
@@ -176,13 +192,18 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
     def get_text_embeddings(self, texts: list[str], **kwargs):
         """Embed text-only inputs.

         Thin wrapper that hands ``texts`` to ``get_fused_embeddings`` without
         supplying any images.

         Args:
             texts: Texts to embed; passed on as the ``texts`` keyword.
             **kwargs: Extra options forwarded unchanged.

         Returns:
             Whatever ``get_fused_embeddings`` returns for these inputs.
         """
         text_embeddings = self.get_fused_embeddings(texts=texts, **kwargs)
         return text_embeddings
-    def get_fused_embeddings(self, texts: list[str] = None, images: list[Image.Image] | DataLoader = None, **kwargs):
         if isinstance(images, DataLoader):
             image_loader = images
             batch_size = image_loader.batch_size
             image_loader.dataset.transform = None
         else:
-            batch_size = kwargs.pop('batch_size', 32)
             if images is None:
                 image_loader = None
             else:
@@ -203,10 +224,18 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         all_embeddings = list()
         none_batch = [None] * batch_size
-        show_progress_bar = kwargs.pop('show_progress_bar', True)
-        pbar = tqdm(total=n_batch, disable=not show_progress_bar, mininterval=1, miniters=10, desc='encode')
-        for n, img_batch in zip(range(0, n_batch * batch_size, batch_size), image_loader):
-            text_batch = none_batch if texts is None else texts[n: n+batch_size]
             img_batch = none_batch if img_batch is None else img_batch
             embeddings = self.embed(texts=text_batch, images=img_batch, **kwargs)
             pbar.update(1)
@@ -215,9 +244,11 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         all_embeddings = torch.cat(all_embeddings, dim=0)
         return all_embeddings
 def custom_collate_fn(batch):
     """Return ``batch`` exactly as received, with no stacking or conversion.

     NOTE(review): presumably passed as a ``DataLoader`` ``collate_fn`` so that
     samples stay in a plain list; the call site is not visible here - confirm.
     """
     samples = batch
     return samples
 # Utility functions (copied from your vision processing code)
 IMAGE_FACTOR: int = 28
 MIN_PIXELS: int = 4 * 28 * 28
@@ -241,7 +272,11 @@ def floor_by_factor(number: int, factor: int) -> int:
 def smart_resize(
-    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
 ) -> tuple[int, int]:
     """
     Rescales the image so that:
@@ -271,11 +306,15 @@ def smart_resize(
     return h_bar, w_bar
-def fetch_image(image: Union[str, Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
     image_obj: Optional[Image.Image] = None
     if isinstance(image, Image.Image):
         image_obj = image
-    elif isinstance(image, str) and (image.startswith("http://") or image.startswith("https://")):
         image_obj = Image.open(requests.get(image, stream=True).raw)
     elif isinstance(image, str) and image.startswith("file://"):
         image_obj = Image.open(image[7:])

 import os
 from collections.abc import Iterable
 class GmeQwen2VLConfig(Qwen2VLConfig):
     model_type: str = "gme_qwen2_vl"
     def __init__(
         self,
         min_image_tokens: int = 256,
 class GmeQwen2VLForVision2Seq(PreTrainedModel):
     config_class = GmeQwen2VLConfig
     base_model_prefix: str = "base"
     def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:
         """Build the Qwen2-VL backbone and the processor that feeds it.

         Args:
             config: Model configuration. ``min_image_tokens``,
                 ``max_image_tokens`` and ``max_length`` are read from it, and
                 ``_name_or_path`` (when set) selects the processor checkpoint.
             **kwargs: Forwarded unchanged to ``AutoProcessor.from_pretrained``.
         """
         super().__init__(config)
         # Fall back on any falsy value, not only on a missing attribute: a
         # config created in code can carry an empty ``_name_or_path``, which
         # is not a loadable processor location.
         model_name: str = (
             getattr(config, "_name_or_path", None)
             or "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct"
         )
         self.base = Qwen2VLForConditionalGeneration(config)
         self.normalize: bool = True
         # Token budgets are converted to pixel budgets with a 28 * 28 factor.
         min_pixels: int = config.min_image_tokens * 28 * 28
         max_pixels: int = config.max_image_tokens * 28 * 28
         self.max_length: int = config.max_length
         self.processor = AutoProcessor.from_pretrained(
             model_name, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
         )
         self.processor.tokenizer.padding_side = "right"
         # NOTE: the attribute name is misspelled, but ``embed`` reads it under
         # this exact name, so it is kept as-is.
         self.defualt_instruction: str = "You are a helpful assistant."
         self.sep: str = " "
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         # video_grid_thw: Optional[torch.LongTensor] = None,
         pooling_mask: Optional[torch.LongTensor] = None,
+        **kwargs,
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.base.model.embed_tokens(input_ids)
             if pixel_values is not None:
                 pixel_values = pixel_values.type(self.base.visual.get_dtype())
+                image_embeds = self.base.visual(
+                    pixel_values, grid_thw=image_grid_thw
+                ).to(inputs_embeds.device)
                 image_mask = input_ids == self.base.config.image_token_id
                 inputs_embeds[image_mask] = image_embeds
             # if pixel_values_videos is not None:
         )
         pooling_mask = attention_mask if pooling_mask is None else pooling_mask
+        left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0]  # TODO
         if left_padding:
             embeddings = outputs.last_hidden_state[:, -1]
         else:
             sequence_lengths = pooling_mask.sum(dim=1) - 1
             batch_size = outputs.last_hidden_state.shape[0]
+            embeddings = outputs.last_hidden_state[
+                torch.arange(batch_size, device=outputs.last_hidden_state.device),
+                sequence_lengths,
+            ]
         if self.normalize:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
         return embeddings.contiguous()
+    def embed(
+        self,
+        texts: list[str],
+        images: list[Image.Image],
+        is_query=True,
+        instruction=None,
+        **kwargs,
+    ):
         self.base.to(self.device)
         # Inputs must be batched
         input_texts, input_images = list(), list()
         for t, i in zip(texts, images):
             if not is_query or instruction is None:
                 instruction = self.defualt_instruction
+            input_str = ""
             if i is None:
                 input_images = None  # All examples in the same batch are consistent
             else:
+                input_str += "<|vision_start|><|image_pad|><|vision_end|>"
                 i = fetch_image(i)
                 input_images.append(i)
             if t is not None:
                 input_str += t
+            msg = f"<|im_start|>system\n{instruction}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
             input_texts.append(msg)
         inputs = self.processor(
             padding=True,
             truncation=True,
             max_length=self.max_length,
+            return_tensors="pt",
         )
         inputs = {k: v.to(self.device) for k, v in inputs.items()}  # TODO
         with torch.no_grad():
         return embeddings
     def encode(self, sentences: list[str], *, prompt_name=None, **kwargs):
         """Embed ``sentences`` by delegating to ``get_fused_embeddings``.

         Args:
             sentences: Texts to embed; passed on as the ``texts`` keyword.
             prompt_name: Keyword-only; forwarded unchanged as ``prompt_name``.
             **kwargs: Extra options forwarded unchanged.

         Returns:
             Whatever ``get_fused_embeddings`` returns for these inputs.
         """
         embeddings = self.get_fused_embeddings(
             texts=sentences,
             prompt_name=prompt_name,
             **kwargs,
         )
         return embeddings
     def encode_queries(self, queries: List[str], **kwargs):
         embeddings = self.encode(queries, **kwargs)
             ]
         else:
             sentences = [
+                (doc["title"] + self.sep + doc["text"]).strip()
+                if "title" in doc
+                else doc["text"].strip()
                 for doc in corpus
             ]
         embeddings = self.encode(sentences, is_query=False, **kwargs)
     def get_text_embeddings(self, texts: list[str], **kwargs):
         """Embed text-only inputs.

         Thin wrapper that hands ``texts`` to ``get_fused_embeddings`` without
         supplying any images.

         Args:
             texts: Texts to embed; passed on as the ``texts`` keyword.
             **kwargs: Extra options forwarded unchanged.

         Returns:
             Whatever ``get_fused_embeddings`` returns for these inputs.
         """
         text_embeddings = self.get_fused_embeddings(texts=texts, **kwargs)
         return text_embeddings
+    def get_fused_embeddings(
+        self,
+        texts: list[str] = None,
+        images: list[Image.Image] | DataLoader = None,
+        **kwargs,
+    ):
         if isinstance(images, DataLoader):
             image_loader = images
             batch_size = image_loader.batch_size
             image_loader.dataset.transform = None
         else:
+            batch_size = kwargs.pop("batch_size", 32)
             if images is None:
                 image_loader = None
             else:
         all_embeddings = list()
         none_batch = [None] * batch_size
+        show_progress_bar = kwargs.pop("show_progress_bar", True)
+        pbar = tqdm(
+            total=n_batch,
+            disable=not show_progress_bar,
+            mininterval=1,
+            miniters=10,
+            desc="encode",
+        )
+        for n, img_batch in zip(
+            range(0, n_batch * batch_size, batch_size), image_loader
+        ):
+            text_batch = none_batch if texts is None else texts[n : n + batch_size]
             img_batch = none_batch if img_batch is None else img_batch
             embeddings = self.embed(texts=text_batch, images=img_batch, **kwargs)
             pbar.update(1)
         all_embeddings = torch.cat(all_embeddings, dim=0)
         return all_embeddings
 def custom_collate_fn(batch):
     """Return ``batch`` exactly as received, with no stacking or conversion.

     NOTE(review): presumably passed as a ``DataLoader`` ``collate_fn`` so that
     samples stay in a plain list; the call site is not visible here - confirm.
     """
     samples = batch
     return samples
 # Utility functions (copied from your vision processing code)
 IMAGE_FACTOR: int = 28
 MIN_PIXELS: int = 4 * 28 * 28
 def smart_resize(
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
 ) -> tuple[int, int]:
     """
     Rescales the image so that:
     return h_bar, w_bar
+def fetch_image(
+    image: Union[str, Image.Image], size_factor: int = IMAGE_FACTOR
+) -> Image.Image:
     image_obj: Optional[Image.Image] = None
     if isinstance(image, Image.Image):
         image_obj = image
+    elif isinstance(image, str) and (
+        image.startswith("http://") or image.startswith("https://")
+    ):
         image_obj = Image.open(requests.get(image, stream=True).raw)
     elif isinstance(image, str) and image.startswith("file://"):
         image_obj = Image.open(image[7:])