Gengzigang committed · Commit 3598b4a · Parent(s): 3831a1e
update README.md
README.md CHANGED

## Usage

### Huggingface Version
Image Embeddings
```python
from PIL import Image
from transformers import AutoModel
from transformers import CLIPImageProcessor
import torch

image_path = "CLIP.png"
model_name_or_path = "microsoft/LLM2CLIP-Openai-L-14-336"  # or /path/to/local/LLM2CLIP-Openai-L-14-336

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
model = AutoModel.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    trust_remote_code=True).to('cuda').eval()

image = Image.open(image_path)
input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    outputs = model.get_image_features(input_pixels)
```
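
The image features can also be used on their own, without any text side. As a minimal sketch (not part of the original README), the snippet below reuses the `model` and `processor` objects from the block above to compare two images by cosine similarity; the file names `cat.png` and `dog.png` are placeholders.

```python
import torch
import torch.nn.functional as F
from PIL import Image

# Hypothetical example: compare two local images via their LLM2CLIP image embeddings.
# Reuses `model` and `processor` exactly as loaded in the snippet above.
paths = ["cat.png", "dog.png"]  # placeholder file names
pixels = processor(images=[Image.open(p) for p in paths],
                   return_tensors="pt").pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    feats = model.get_image_features(pixels)   # shape: (2, embed_dim)

feats = F.normalize(feats.float(), dim=-1)     # unit-normalize before comparing
similarity = feats[0] @ feats[1]               # cosine similarity, scalar tensor
print("Image-image cosine similarity:", similarity.item())
```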

Retrieval
```python
from PIL import Image
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers import CLIPImageProcessor
import torch
from llm2vec import LLM2Vec

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
model_name_or_path = "microsoft/LLM2CLIP-Openai-L-14-336"  # or /path/to/local/LLM2CLIP-Openai-L-14-336
model = AutoModel.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    trust_remote_code=True).to('cuda').eval()

llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
    llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct'  # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)

captions = ["a diagram", "a dog", "a cat"]
image_path = "CLIP.png"

image = Image.open(image_path)
input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.get_image_features(input_pixels)
    text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
    text_features = model.get_text_features(text_features)

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)
```
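
The same pieces extend naturally from caption classification to text-to-image retrieval. The sketch below is an illustrative variation, not from the original README: it reuses the `processor`, `model`, and `l2v` objects defined above, embeds a placeholder set of image files, and ranks them against a single query caption.

```python
import torch
from PIL import Image

# Hypothetical example: rank a small set of images against one text query.
# Reuses `processor`, `model`, and `l2v` from the Retrieval snippet above.
image_paths = ["CLIP.png", "diagram.png", "dog.png"]   # placeholder file names
query = ["a dog playing in the park"]

pixels = processor(images=[Image.open(p) for p in image_paths],
                   return_tensors="pt").pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.get_image_features(pixels)
    text_features = model.get_text_features(l2v.encode(query, convert_to_tensor=True).to('cuda'))

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # One similarity score per image; higher means a better match to the query.
    scores = (image_features @ text_features.T).squeeze(-1)

for path, score in sorted(zip(image_paths, scores.tolist()), key=lambda x: -x[1]):
    print(f"{path}: {score:.3f}")
```

Because the text features are computed once, they can be cached and reused across arbitrarily many images, which is the usual pattern for retrieval over larger collections.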

## BibTeX & Citation

```tex
@misc{huang2024llm2clip,
      title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
      author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
      year={2024},
      eprint={2411.04997},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2411.04997},
}
```