Gengzigang committed
Commit 3598b4a · 1 Parent(s): 3831a1e

update README.md

Files changed (1)
  1. README.md +43 -11
README.md CHANGED
@@ -33,6 +33,7 @@ In this paper, we propose LLM2CLIP, a novel approach that embraces the power of
 ## Usage
 
 ### Huggingface Version
+Image Embeddings
 ```python
 from PIL import Image
 from transformers import AutoModel
@@ -54,17 +55,48 @@ input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
 with torch.no_grad(), torch.cuda.amp.autocast():
     outputs = model.get_image_features(input_pixels)
 ```
-
-## BibTeX & Citation
+Retrieval
+```python
+from PIL import Image
+from transformers import AutoModel, AutoConfig, AutoTokenizer
+from transformers import CLIPImageProcessor
+import torch
+from llm2vec import LLM2Vec
+
+processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
+model_name_or_path = "microsoft/LLM2CLIP-Openai-L-14-336" # or /path/to/local/LLM2CLIP-Openai-L-14-336
+model = AutoModel.from_pretrained(
+    model_name_or_path,
+    torch_dtype=torch.float16,
+    trust_remote_code=True).to('cuda').eval()
+
+llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
+config = AutoConfig.from_pretrained(
+    llm_model_name, trust_remote_code=True
+)
+llm_model = AutoModel.from_pretrained(llm_model_name, config=config, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
+llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
+l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
+
+captions = ["a diagram", "a dog", "a cat"]
+image_path = "CLIP.png"
+
+image = Image.open(image_path)
+input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
+
+with torch.no_grad(), torch.cuda.amp.autocast():
+    image_features = model.get_image_features(input_pixels)
+    text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
+    text_features = model.get_text_features(text_features)
+
+    image_features /= image_features.norm(dim=-1, keepdim=True)
+    text_features /= text_features.norm(dim=-1, keepdim=True)
+
+    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+print("Label probs:", text_probs)
 
 ```
-@misc{huang2024llm2clippowerfullanguagemodel,
-      title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
-      author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
-      year={2024},
-      eprint={2411.04997},
-      archivePrefix={arXiv},
-      primaryClass={cs.CV},
-      url={https://arxiv.org/abs/2411.04997},
-}
-```
+
+## BibTeX & Citation
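
As a usage note for the Retrieval snippet above: a minimal, illustrative sketch of turning `text_probs` into a ranked caption list. The `captions` list and the probability values below are placeholders standing in for the snippet's real outputs; they are not part of the committed README.

```python
import torch

# Placeholder values standing in for the outputs of the Retrieval snippet above:
# `text_probs` there has shape [num_images, num_captions].
captions = ["a diagram", "a dog", "a cat"]
text_probs = torch.tensor([[0.92, 0.05, 0.03]])

# Rank all captions for the first image by probability.
probs, indices = text_probs[0].sort(descending=True)
for prob, idx in zip(probs.tolist(), indices.tolist()):
    print(f"{captions[idx]}: {prob:.4f}")

# Or just take the single best match.
print("Best match:", captions[text_probs[0].argmax().item()])
```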