Model Details
- Developed by: Jian Chen
- Model type: MLLM-based encoder
- Finetuned from model: OpenGVLab/InternVL2-4B
Model Sources
- GitHub: SV-RAG
- Paper: SV-RAG: LoRA-Contextualizing Adaptation of Large Multimodal Models for Long Document Understanding
Uses
A demo script is provided in the GitHub repository.
Alternatively, the code below gives a more detailed breakdown of the computation. It relies on a
customized version of colpali_engine, which is available in the GitHub repository.
from colpali_engine.models import ColInternvl2_4b, ColInternProcessor
class ColInternVL2Retriever(BaseRetriever):
    """Retriever class using ColInternVL2 for multimodal retrieval.

    Wraps a ``ColInternvl2_4b`` model and a ``ColInternProcessor`` to embed
    text queries and images, score queries against images, and rank the
    images for each query.
    """

    def __init__(self, model_name="puar-playground/Col-InternVL2-4B", device="cuda" if torch.cuda.is_available() else "cpu"):
        """
        Initializes the ColInternVL2 model and its processor.

        Args:
            model_name (str): The model identifier passed to ``from_pretrained``.
            device (str): Device to run the model on ('cuda' or 'cpu').
        """
        # NOTE(review): this pins transformers by shelling out to pip every
        # time a retriever is constructed. Kept as-is for compatibility with
        # the published demo; consider moving it to the setup instructions.
        os.system('pip install transformers==4.47.1')
        self.multimodel = True
        self.device = device
        self.model = ColInternvl2_4b.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map=device).eval()
        self.processor = ColInternProcessor('OpenGVLab/InternVL2-4B')

    def process_text(self, query_list: List[str], batch_size: int = 4):
        """
        Processes a list of text queries into embeddings using ColInternVL2 in batches.

        Args:
            query_list (List[str]): List of query texts.
            batch_size (int): Number of queries processed per batch.

        Returns:
            torch.Tensor: Embeddings for all queries, padded to a common
            length and concatenated along the first dimension (on CPU).

        Raises:
            ValueError: If ``query_list`` is empty.
        """
        all_embeddings = []
        for i in range(0, len(query_list), batch_size):
            batch_queries = query_list[i : i + batch_size]
            # Convert queries to model-compatible format
            batch_inputs = self.processor.process_queries(batch_queries).to(self.model.device)
            with torch.no_grad():
                batch_embeddings = self.model(**batch_inputs)
            # Move each batch off the accelerator right away so device memory
            # does not grow with the number of queries.
            all_embeddings.append(batch_embeddings.to("cpu"))
        # Batches can differ in sequence length, so pad before concatenating.
        return self.pad_and_cat_tensors(all_embeddings)

    @staticmethod
    def pad_and_cat_tensors(tensor_list):
        """
        Zero-pads tensors to a common size in dimension 1 and concatenates them.

        Args:
            tensor_list: Non-empty list of tensors. Assumes each tensor is 3-D
                (batch, sequence, embedding) so that dimension 1 is also the
                second-to-last dimension -- TODO confirm against the model output.

        Returns:
            torch.Tensor: The padded tensors concatenated along dimension 0.

        Raises:
            ValueError: If ``tensor_list`` is empty.
        """
        if not tensor_list:
            raise ValueError("pad_and_cat_tensors() requires at least one tensor")
        # Find the maximum length of the second dimension across all tensors
        max_x = max(tensor.size(1) for tensor in tensor_list)
        # The pad spec is read from the last dimension backwards: (0, 0) leaves
        # the last dimension alone, (0, n) appends n zeros to the one before it.
        padded_tensors = [
            torch.nn.functional.pad(tensor, (0, 0, 0, max_x - tensor.size(1)))
            for tensor in tensor_list
        ]
        # Concatenate the padded tensors along the first dimension
        return torch.cat(padded_tensors, dim=0)

    def process_image(self, image_dir_list: List[str]):
        """
        Processes images into embeddings using ColInternVL2, one image at a time.

        Args:
            image_dir_list (List[str]): Paths of the image files to embed.

        Returns:
            torch.Tensor: Embeddings for all images, padded to a common
            length and concatenated along the first dimension (on CPU).

        Raises:
            ValueError: If ``image_dir_list`` is empty.
        """
        all_embeddings = []
        with torch.no_grad():
            for img_dir in image_dir_list:
                # Open inside a context manager so the file handle is released
                # as soon as the processor has turned the image into tensors.
                with Image.open(img_dir) as img:
                    # NOTE(review): a single image (not a list) is passed, as in
                    # the original demo -- confirm this matches the customized
                    # ColInternProcessor.process_images signature.
                    batch_features = self.processor.process_images(img)
                # Move every tensor in the BatchFeature to the model's device
                batch_images = {k: v.to(self.model.device) for k, v in batch_features.items()}
                embeddings = self.model(**batch_images)
                # Move embeddings to CPU before accumulating them
                all_embeddings.append(embeddings.to("cpu"))
        # Images can yield different sequence lengths, so pad before concatenating.
        return self.pad_and_cat_tensors(all_embeddings)

    def compute_similarity(self, text_embeddings, image_embeddings):
        """
        Scores text embeddings against image embeddings.

        Delegates to the processor's ``score_multi_vector``.

        Args:
            text_embeddings: Query embeddings, as returned by ``process_text``.
            image_embeddings: Image embeddings, as returned by ``process_image``.

        Returns:
            The scores produced by ``score_multi_vector``.
        """
        scores = self.processor.score_multi_vector(text_embeddings, image_embeddings)
        return scores

    def retrieve(self, query_list: List[str], image_list: List[str]):
        """
        Ranks images for each query by similarity score.

        Args:
            query_list (List[str]): Query texts. A single string is also
                accepted and treated as one query.
            image_list (List[str]): Paths of the candidate image files.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: The scores sorted in descending
            order along the last dimension, and the matching indices.
            Presumably one row per query and one column per image -- verify
            against ``score_multi_vector``.
        """
        # A bare string would otherwise be sliced into character chunks by
        # process_text; treat it as a single query instead.
        if isinstance(query_list, str):
            query_list = [query_list]
        text_embeddings = self.process_text(query_list)
        image_embeddings = self.process_image(image_list)
        similarity_score = self.compute_similarity(text_embeddings, image_embeddings)
        # as_tensor accepts tensors, arrays and lists alike, and avoids the
        # copy-construct warning torch.tensor() emits when given a tensor.
        values, top_indices = torch.as_tensor(similarity_score).sort(descending=True)
        return values, top_indices
Citation
@article{chen2024lora,
title={LoRA-Contextualizing Adaptation of Large Multimodal Models for Long Document Understanding},
author={Chen, Jian and Zhang, Ruiyi and Zhou, Yufan and Yu, Tong and Dernoncourt, Franck and Gu, Jiuxiang and Rossi, Ryan A and Chen, Changyou and Sun, Tong},
journal={arXiv preprint arXiv:2411.01106},
year={2024}
}
- Downloads last month
- 88
Inference Providers
NEW
This model is not currently available via any of the supported Inference Providers.
The model cannot be deployed to the HF Inference API:
The model has no pipeline_tag.
Model tree for puar-playground/Col-InternVL2-4B
Base model
OpenGVLab/InternVL2-4B