jupyterjazz committed
Commit baee517 · verified · 1 Parent(s): e9be62d

refine-the-codebase (#5)


- feat: encode, prefixes, matryoshka, etc (df46e74a27635c8ed0e13a53e06588c2d1f933ff)

Files changed (3)
  1. README.md +68 -8
  2. config.json +2 -1
  3. modeling_jina_embeddings_v4.py +116 -54
README.md CHANGED
@@ -1,24 +1,84 @@
 # Jina Embeddings V4
 
-Load the model:
+
+## Examples
+
+Encode functions:
 
 ```python
+import torch
 from transformers import AutoModel
+from PIL import Image
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# Load model
 model = AutoModel.from_pretrained('jinaai/jina-embeddings-v4', trust_remote_code=True)
-```
+model = model.to(device)
 
-Encode Text:
+# Sample data
+texts = ["Here is some sample code", "This is a matching text"]
+image_paths = ['/<path_to_image>']
+images = [Image.open(path) for path in image_paths]
+
+# Example 1: Text matching task with single vector embeddings
+model.set_task(task='text-matching')
+
+# Generate embeddings with dimension truncation (256)
+img_embeddings = model.encode_images(images=images, truncate_dim=256)
+text_embeddings = model.encode_texts(texts=texts, truncate_dim=256, max_length=512)
+
+# Example 2: Retrieval task with multi-vector embeddings
+model.set_task(task='retrieval')
+
+# Generate multi-vector embeddings
+img_embeddings = model.encode_images(images=images, vector_type='multi_vector')
+text_embeddings = model.encode_texts(texts=texts, vector_type='multi_vector', text_type='passage')
+
+# Example 3: Code task with single vector embeddings
+model.set_task(task='code')
+
+code = ["def hello_world():\n    print('Hello, World!')"]
+code_embeddings = model.encode_texts(texts=code)
 
-```python
-text_embedding = model.encode_texts(['test'])
 ```
 
-Encode Image (very slow on CPU):
+Using the model forward:
+
 ```python
+import torch
+from transformers import AutoModel, AutoProcessor
 from PIL import Image
 
-img = Image.open('path/to/your/image.png')
-image_embedding = m.encode_images([img])
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# Load model and processor
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v4', trust_remote_code=True)
+model = model.to(device)
+processor = AutoProcessor.from_pretrained('jinaai/jina-embeddings-v4', trust_remote_code=True)
+
+
+# Sample data
+texts = ["Here is some sample code", "This is a matching text"]
+image_paths = ['/<path_to_image>']
+
+# Process text and images
+text_batch = processor.process_texts(texts=texts, prefix="Query", max_length=512)
+images = [Image.open(path) for path in image_paths]
+image_batch = processor.process_images(images=images)
+
+# Forward pass
+model.eval()
+with torch.no_grad():
+    text_batch = {k: v.to(device) for k, v in text_batch.items()}
+    image_batch = {k: v.to(device) for k, v in image_batch.items()}
+
+    with torch.autocast(device_type='cuda' if torch.cuda.is_available() else 'cpu'):
+        # Get embeddings
+        text_embeddings = model.model(**text_batch).single_vec_emb
+        img_embeddings = model.model(**image_batch).single_vec_emb
+
+
 ```
 
 
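The README examples stop once embeddings are produced. The sketch below is an illustration added for this write-up, not part of the commit: it assumes the single-vector outputs are rows of a `torch.Tensor` (stack list outputs with `torch.stack` first) and shows pairwise cosine scoring, plus a ColBERT-style MaxSim score for the multi-vector case.

```python
import torch
import torch.nn.functional as F

def cosine_scores(text_embs: torch.Tensor, img_embs: torch.Tensor) -> torch.Tensor:
    """Pairwise cosine similarity between two batches of single-vector embeddings."""
    t = F.normalize(torch.atleast_2d(text_embs), dim=-1)
    i = F.normalize(torch.atleast_2d(img_embs), dim=-1)
    return t @ i.T  # shape: (num_texts, num_images)

def maxsim_score(query_tokens: torch.Tensor, doc_tokens: torch.Tensor) -> torch.Tensor:
    """Late-interaction (MaxSim) score for multi-vector embeddings:
    each query token is matched to its best document token, then summed."""
    sim = query_tokens @ doc_tokens.T  # (num_query_tokens, num_doc_tokens)
    return sim.max(dim=-1).values.sum()
```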
 
config.json CHANGED
@@ -53,5 +53,6 @@
   "vision_end_token_id": 151653,
   "vision_start_token_id": 151652,
   "vision_token_id": 151654,
-  "vocab_size": 151936
+  "vocab_size": 151936,
+  "truncate_dim": null
 }
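The new `truncate_dim` field is read by the model code as a default for Matryoshka-style truncation: `_validate_encoding_params` falls back to `self.config.truncate_dim` when no per-call value is given. A minimal sketch of that fallback behavior, with `SimpleNamespace` standing in for the real config object (illustrative only):

```python
from types import SimpleNamespace
from typing import Optional

TRUNCATE_DIMS = [128, 256, 512, 1024]  # allowed truncation dimensions (from the model code)

def resolve_truncate_dim(config, truncate_dim: Optional[int] = None) -> Optional[int]:
    # Per-call argument wins; otherwise fall back to the config default (null/None = no truncation).
    truncate_dim = truncate_dim or config.truncate_dim
    if truncate_dim is not None and truncate_dim not in TRUNCATE_DIMS:
        raise ValueError(f"Invalid truncate_dim: {truncate_dim}. Must be one of {TRUNCATE_DIMS}.")
    return truncate_dim

config = SimpleNamespace(truncate_dim=None)      # mirrors "truncate_dim": null in config.json
assert resolve_truncate_dim(config) is None      # no default, no truncation
assert resolve_truncate_dim(config, 256) == 256  # explicit per-call override
```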
modeling_jina_embeddings_v4.py CHANGED
@@ -1,4 +1,6 @@
-import math
+# Jina Embeddings V4 Model implementation was inspired by the ColPali codebase:
+# https://github.com/illuin-tech/colpali
+
 import os
 from dataclasses import dataclass
 from enum import Enum
@@ -15,7 +17,6 @@ from torch import nn
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import BatchFeature
-from transformers.modeling_utils import PreTrainedModel
 from transformers.models.qwen2_5_vl import (Qwen2_5_VLForConditionalGeneration,
                                             Qwen2_5_VLProcessor)
 
@@ -33,27 +34,17 @@ class TaskType(str, Enum):
     text_matching = "text-matching"
 
 
+PREFIX_DICT = {"query": "Query", "passage": "Passage"}
+TRUNCATE_DIMS = [128, 256, 512, 1024]
+VECTOR_TYPES = ["single_vector", "multi_vector"]
+
+
 class JinaEmbeddingsV4Processor(Qwen2_5_VLProcessor):
     def __init__(self, *args, **kwargs) -> None:
         Qwen2_5_VLProcessor.__init__(self, *args, **kwargs)
         self.assistant_prefix_len = 58
         self.text_max_length = 8192
 
-    @staticmethod
-    def round_by_factor(number: float, factor: int) -> int:
-        """Returns the closest integer to 'number' that is divisible by 'factor'."""
-        return round(number / factor) * factor
-
-    @staticmethod
-    def ceil_by_factor(number: float, factor: int) -> int:
-        """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-        return math.ceil(number / factor) * factor
-
-    @staticmethod
-    def floor_by_factor(number: float, factor: int) -> int:
-        """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-        return math.floor(number / factor) * factor
-
     def process_images(
         self,
         images: Union[List[Image.Image], List[List[Image.Image]]],
@@ -175,7 +166,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
             [pv[:o] for pv, o in zip(kwargs["pixel_values"], offsets)], dim=0
         )
 
-        position_ids, rope_deltas = super().get_rope_index(  # type: ignore
+        position_ids, rope_deltas = super().get_rope_index(
            input_ids=input_ids,
            image_grid_thw=kwargs.get("image_grid_thw", None),
            attention_mask=attention_mask,
@@ -267,10 +258,10 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         **kwargs,
     ) -> JinaEmbeddingsV4ModelOutput:
         """
-        Forward pass through QwenVL25Embeddings. Returns both single-vector and multi-vector embeddings.
+        Forward pass through the model. Returns both single-vector and multi-vector embeddings.
         Args:
-            input_ids (torch.LongTensor): The input tokens tensor.
-            attention_mask (torch.LongTensor): The attention mask tensor.
+            input_ids (torch.Tensor): The input tokens tensor.
+            attention_mask (torch.Tensor): The attention mask tensor.
         Returns:
             JinaEmbeddingsV4ModelOutput:
                 single_vector (torch.Tensor): Single-vector embeddings of shape (batch_size, dim).
@@ -302,17 +293,17 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         data: List[Union[str, Image.Image]],
         processor_fn: Callable,
         desc: str,
-        vector_type: Optional[str] = None,
+        vector_type: str = "single_vector",
         return_numpy: bool = False,
-        **kwargs,
+        batch_size: int = 32,
+        truncate_dim: Optional[int] = None,
     ) -> Union[np.ndarray, List[torch.Tensor]]:
         dataloader = DataLoader(
             dataset=data,
-            batch_size=kwargs.get("batch_size", 32),
+            batch_size=batch_size,
             shuffle=False,
             collate_fn=processor_fn,
         )
-        vector_type = vector_type or "single_vector"
         results = []
         self.eval()
         for batch in tqdm(dataloader, desc=desc):
@@ -322,8 +313,11 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
                 embeddings = self(**batch)
                 if vector_type == "single_vector":
                     embeddings = embeddings.single_vec_emb
+                    if truncate_dim is not None:
+                        embeddings = embeddings[:, :truncate_dim]
                 else:
                     embeddings = embeddings.multi_vec_emb
+
                 results.append(
                     embeddings.cpu()
                     if return_numpy
@@ -333,44 +327,98 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
             return np.concatenate([result.numpy() for result in results], axis=0)
         return [item for sublist in results for item in sublist]
 
+    def _validate_encoding_params(
+        self,
+        vector_type: Optional[str] = None,
+        truncate_dim: Optional[int] = None,
+        text_type: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        encode_kwargs = {}
+        if text_type is not None:
+            if text_type not in PREFIX_DICT:
+                raise ValueError(
+                    f"Invalid text_type: {text_type}. Must be one of {list(PREFIX_DICT.keys())}."
+                )
+            else:
+                encode_kwargs["prefix"] = (
+                    PREFIX_DICT[text_type]
+                    if self.task != TaskType.text_matching
+                    else PREFIX_DICT["query"]
+                )
+
+        vector_type = vector_type or "single_vector"
+        if vector_type not in VECTOR_TYPES:
+            raise ValueError(
+                f"Invalid vector_type: {vector_type}. Must be one of {VECTOR_TYPES}."
+            )
+        else:
+            encode_kwargs["vector_type"] = vector_type
+
+        truncate_dim = truncate_dim or self.config.truncate_dim
+        if truncate_dim is not None and truncate_dim not in TRUNCATE_DIMS:
+            raise ValueError(
+                f"Invalid truncate_dim: {truncate_dim}. Must be one of {TRUNCATE_DIMS}."
+            )
+        else:
+            encode_kwargs["truncate_dim"] = truncate_dim
+
+        return encode_kwargs
+
     def encode_texts(
         self,
-        queries: List[str],
+        texts: List[str],
         max_length: int = 8192,
         batch_size: int = 8,
         vector_type: Optional[str] = None,
-        desc: Optional[str] = None,
-        **kwargs,
+        return_numpy: bool = False,
+        truncate_dim: Optional[int] = None,
+        text_type: Optional[str] = None,
     ) -> List[torch.Tensor]:
+        text_type = text_type or "query"
+        encode_kwargs = self._validate_encoding_params(
+            vector_type, truncate_dim, text_type
+        )
+
         processor_fn = partial(
-            self.processor.process_texts, max_length=max_length, prefix="Query"
+            self.processor.process_texts,
+            max_length=max_length,
+            prefix=encode_kwargs.pop("prefix"),
        )
-        return self._process_batches(
-            data=queries,
+
+        is_single = len(texts) == 1
+        embeddings = self._process_batches(
+            data=texts,
             processor_fn=processor_fn,
-            desc=desc or "Encode queries...",
-            vector_type=vector_type,
+            desc="Encoding texts...",
+            return_numpy=return_numpy,
             batch_size=batch_size,
-            **kwargs,
+            **encode_kwargs,
         )
 
+        return embeddings[0] if is_single else embeddings
+
     def encode_images(
         self,
-        documents: List[Image.Image],
+        images: List[Image.Image],
         batch_size: int = 8,
         vector_type: Optional[str] = None,
-        desc: Optional[str] = None,
-        **kwargs,
+        return_numpy: bool = False,
+        truncate_dim: Optional[int] = None,
     ) -> List[torch.Tensor]:
-        return self._process_batches(
-            data=documents,
+        encode_kwargs = self._validate_encoding_params(vector_type, truncate_dim)
+
+        is_single = len(images) == 1
+        embeddings = self._process_batches(
+            data=images,
             processor_fn=self.processor.process_images,
-            desc=desc or "Encode documents...",
-            vector_type=vector_type,
+            desc="Encoding images...",
             batch_size=batch_size,
+            return_numpy=return_numpy,
+            **encode_kwargs,
         )
 
+        return embeddings[0] if is_single else embeddings
+
     @classmethod
     def from_pretrained(
         cls,
@@ -381,9 +429,15 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         if "torch_dtype" not in kwargs:
             kwargs["torch_dtype"] = "auto"
 
-        task = kwargs.pop("task", TaskType.retrieval)
+        task_value = kwargs.pop("task", "retrieval")
+        try:
+            task = TaskType(task_value)
+        except ValueError:
+            valid_tasks = [t.value for t in TaskType]
+            raise ValueError(
+                f"Invalid task: {task_value}. Must be one of {valid_tasks}."
+            )
 
-        # Get the base model first
         base_model = super().from_pretrained(
             pretrained_model_name_or_path, *args, **kwargs
         )
@@ -397,36 +451,44 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         )
         adapter_dir = os.path.join(adapter_cache_path, "adapters")
 
-        # Store adapter directory for later use with set_task
         base_model.adapter_dir = adapter_dir
+        base_model.task = task
 
         # Create the PEFT model with the requested task adapter
         peft_model = PeftModel.from_pretrained(
-            base_model, os.path.join(adapter_dir, task)
+            base_model, os.path.join(adapter_dir, task.value)
         )
 
         # Add set_task method to the PEFT model instance
-        def set_task_method(self, task_name: Union[str, TaskType]):
+        def set_task_method(self, task: Union[str, TaskType]):
            """
            Set the task adapter for the model.
 
            Args:
-                task_name (Union[str, TaskType]): The task name. Must be one of TaskType values or
+                task (Union[str, TaskType]): The task name. Must be one of TaskType values or
                    one of ['retrieval', 'text-matching', 'code']
            """
-            if isinstance(task_name, str):
+            if isinstance(task, str):
                try:
-                    task_name = TaskType(task_name)
+                    task = TaskType(task)
                except ValueError:
                    valid_tasks = [t.value for t in TaskType]
                    raise ValueError(
-                        f"Invalid task: {task_name}. Must be one of {valid_tasks}"
+                        f"Invalid task: {task}. Must be one of {valid_tasks}"
                    )
+            if self.model.task != task:
+                adapter_path = os.path.join(self.adapter_dir, task.value)
+                hotswap_adapter(self, adapter_path, adapter_name="default")
+                self.model.task = task
 
-            adapter_path = os.path.join(self.adapter_dir, task_name.value)
-            hotswap_adapter(self, adapter_path, adapter_name="default")
+        def get_task_method(self):
+            """
+            Get the task adapter for the model.
+            """
+            return self.model.task.value
 
-        # Bind the method to the instance
+        # Bind the methods to the instance
         peft_model.set_task = set_task_method.__get__(peft_model, type(peft_model))
+        peft_model.get_task = get_task_method.__get__(peft_model, type(peft_model))
 
         return peft_model
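Taken together, the diff gives the returned PEFT model a small task-management API (`set_task`/`get_task`) on top of adapter hotswapping. A hedged usage sketch, assuming the checkpoint and its adapters download as in `from_pretrained` above:

```python
from transformers import AutoModel

# Load with an explicit task; the string is validated against TaskType at load time.
model = AutoModel.from_pretrained(
    'jinaai/jina-embeddings-v4',
    trust_remote_code=True,
    task='retrieval',
)

print(model.get_task())          # 'retrieval'

model.set_task('text-matching')  # hotswaps the PEFT adapter in place
model.set_task('text-matching')  # same task again: no hotswap is performed

# Unknown task names raise a ValueError listing the valid options.
try:
    model.set_task('classification')
except ValueError as err:
    print(err)
```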