Gengzigang committed · Commit 3598b4a · Parent(s): 3831a1e
update README.md
README.md CHANGED

## Usage

### Huggingface Version
Image Embeddings
```python
from PIL import Image
from transformers import AutoModel
from transformers import CLIPImageProcessor
import torch

image_path = "CLIP.png"
model_name_or_path = "microsoft/LLM2CLIP-Openai-L-14-336"  # or /path/to/local/LLM2CLIP-Openai-L-14-336

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
model = AutoModel.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    trust_remote_code=True).to('cuda').eval()

image = Image.open(image_path)
input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    outputs = model.get_image_features(input_pixels)
```
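
The image features can also be used on their own, without any text side. As a minimal sketch (not part of the original README), the snippet below reuses the `model` and `processor` objects from the block above to compare two images by cosine similarity; the file names `cat.png` and `dog.png` are placeholders.

```python
import torch
import torch.nn.functional as F
from PIL import Image

# Hypothetical example: compare two local images via their LLM2CLIP image embeddings.
# Reuses `model` and `processor` exactly as loaded in the snippet above.
paths = ["cat.png", "dog.png"]  # placeholder file names
pixels = processor(images=[Image.open(p) for p in paths],
                   return_tensors="pt").pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    feats = model.get_image_features(pixels)   # shape: (2, embed_dim)

feats = F.normalize(feats.float(), dim=-1)     # unit-normalize before comparing
similarity = feats[0] @ feats[1]               # cosine similarity, scalar tensor
print("Image-image cosine similarity:", similarity.item())
```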

Retrieval
```python
from PIL import Image
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers import CLIPImageProcessor
import torch
from llm2vec import LLM2Vec

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
model_name_or_path = "microsoft/LLM2CLIP-Openai-L-14-336"  # or /path/to/local/LLM2CLIP-Openai-L-14-336
model = AutoModel.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    trust_remote_code=True).to('cuda').eval()

llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
    llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct'  # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)

captions = ["a diagram", "a dog", "a cat"]
image_path = "CLIP.png"

image = Image.open(image_path)
input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.get_image_features(input_pixels)
    text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
    text_features = model.get_text_features(text_features)

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)
```
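
The same pieces extend naturally from caption classification to text-to-image retrieval. The sketch below is an illustrative variation, not from the original README: it reuses the `processor`, `model`, and `l2v` objects defined above, embeds a placeholder set of image files, and ranks them against a single query caption.

```python
import torch
from PIL import Image

# Hypothetical example: rank a small set of images against one text query.
# Reuses `processor`, `model`, and `l2v` from the Retrieval snippet above.
image_paths = ["CLIP.png", "diagram.png", "dog.png"]   # placeholder file names
query = ["a dog playing in the park"]

pixels = processor(images=[Image.open(p) for p in image_paths],
                   return_tensors="pt").pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.get_image_features(pixels)
    text_features = model.get_text_features(l2v.encode(query, convert_to_tensor=True).to('cuda'))

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # One similarity score per image; higher means a better match to the query.
    scores = (image_features @ text_features.T).squeeze(-1)

for path, score in sorted(zip(image_paths, scores.tolist()), key=lambda x: -x[1]):
    print(f"{path}: {score:.3f}")
```

Because the text features are computed once, they can be cached and reused across arbitrarily many images, which is the usual pattern for retrieval over larger collections.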

## BibTeX & Citation

```tex
@misc{huang2024llm2clip,
      title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
      author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
      year={2024},
      eprint={2411.04997},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2411.04997},
}
```