nkanungo committed
Commit a380d32 · 1 Parent(s): f22b827

Upload 6 files

Files changed (6):
  1. README.md +31 -7
  2. app.py +112 -0
  3. features.npy +3 -0
  4. photo_ids.csv +0 -0
  5. photos.tsv000 +0 -0
  6. requirements.txt +5 -0
README.md CHANGED
@@ -1,13 +1,37 @@
  ---
- title: Clip
- emoji: 📉
- colorFrom: yellow
- colorTo: indigo
  sdk: gradio
- sdk_version: 4.0.2
  app_file: app.py
  pinned: false
- license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Image Search Using CLIP
+ emoji: 🏢
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
  app_file: app.py
  pinned: false
  ---

+ # Configuration
+
+ `title`: _string_
+ Display title for the Space
+
+ `emoji`: _string_
+ Space emoji (emoji-only character allowed)
+
+ `colorFrom`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `colorTo`: _string_
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+
+ `sdk`: _string_
+ Can be either `gradio` or `streamlit`
+
+ `sdk_version`: _string_
+ Only applicable for `streamlit` SDK.
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+
+ `app_file`: _string_
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+ Path is relative to the root of the repository.
+
+ `pinned`: _boolean_
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,112 @@
+ # Acknowledgments:
+ # This project is inspired by:
+ # 1. https://github.com/haltakov/natural-language-image-search by Vladimir Haltakov
+ # 2. OpenAI's CLIP
+
+
+ # Import all the necessary libraries
+ import torch
+ import requests
+ import numpy as np
+ import pandas as pd
+ import gradio as gr
+ from io import BytesIO
+
+ from PIL import Image as PILIMAGE
+
+ from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
+ from sentence_transformers import SentenceTransformer, util
+
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Define the model: CLIP ViT-B/32 plus its processor and tokenizer
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+ tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+ # Load data: photo metadata, precomputed image features, and the matching photo IDs
+ photos = pd.read_csv("./photos.tsv000", sep='\t', header=0)
+ photo_features = np.load("./features.npy")
+ photo_ids = pd.read_csv("./photo_ids.csv")
+ photo_ids = list(photo_ids['photo_id'])
+
+
+ def encode_text(text):
+     with torch.no_grad():
+         # Encode the description using CLIP's text encoder
+         inputs = processor(text=[text], images=None, return_tensors="pt", padding=True).to(device)
+         text_encoded = model.get_text_features(**inputs).cpu().numpy()
+     return text_encoded
+
+
+ def encode_image(image):
+     image = PILIMAGE.fromarray(image.astype('uint8'), 'RGB')
+     with torch.no_grad():
+         # Preprocess the image and encode it with CLIP's image encoder
+         photo_preprocessed = processor(text=None, images=image, return_tensors="pt", padding=True)["pixel_values"]
+         search_photo_feature = model.get_image_features(photo_preprocessed.to(device))
+         # L2-normalize so the dot product below behaves as a cosine similarity
+         search_photo_feature /= search_photo_feature.norm(dim=-1, keepdim=True)
+         image_encoded = search_photo_feature.cpu().numpy()
+     return image_encoded
+
+
+ T2I = "Text2Image"
+ I2I = "Image2Image"
+
+
+ def similarity(feature, photo_features):
+     # (1, 512) query @ (512, N) gallery -> one similarity score per photo
+     similarities = list((feature @ photo_features.T).squeeze(0))
+     return similarities
+
+
+ def find_best_matches(image, mode, text):
+     # Compute the similarity between the query and each photo using cosine similarity
+     print("Mode:", mode)
+
+     if mode == T2I:
+         # Encode the text input
+         feature = encode_text(text)
+     else:
+         # Encode the image input
+         feature = encode_image(image)
+     similarities = similarity(feature, photo_features)
+
+     # Sort the photos by their similarity score
+     best_photos = sorted(zip(similarities, range(photo_features.shape[0])), key=lambda x: x[0], reverse=True)
+
+     matched_images = []
+     for i in range(3):
+         # Retrieve the photo ID of the i-th best match
+         idx = best_photos[i][1]
+         photo_id = photo_ids[idx]
+
+         # Get all metadata for this photo
+         photo_data = photos[photos["photo_id"] == photo_id].iloc[0]
+
+         # Download a 640px-wide rendition and collect it for the gallery
+         response = requests.get(photo_data["photo_image_url"] + "?w=640")
+         img = PILIMAGE.open(BytesIO(response.content))
+         matched_images.append(img)
+     return matched_images
+
+
+ gr.Interface(fn=find_best_matches,
+              inputs=[
+                  gr.Image(label="Image to search"),
+                  gr.Radio([T2I, I2I], label="Mode", value=T2I),
+                  gr.Textbox(lines=1, label="Text query", placeholder="Enter the search text..."),
+              ],
+              theme="grass",
+              outputs=[gr.Gallery(label="Generated images", show_label=False, elem_id="gallery",
+                                  columns=2, height="auto")],
+              title="CLIP Image Search",
+              description="Displays the top three images from the Unsplash dataset that best match "
+                          "the user's query, given either as text (Text2Image) or as an image "
+                          "(Image2Image).").queue().launch()
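Note that app.py loads `features.npy` and `photo_ids.csv` ready-made; nothing in this commit shows how they were produced. Below is a minimal sketch of how such a pair could be precomputed with the same ViT-B/32 model. The `./photos` folder, the 32-image batch size, and the use of file stems as photo IDs are illustrative assumptions only — the upstream natural-language-image-search project builds its features from the downloaded Unsplash dataset instead.

```python
# Sketch: precompute L2-normalized CLIP features for a local photo collection.
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

paths = sorted(Path("./photos").glob("*.jpg"))  # hypothetical local image copies
features = []
for start in range(0, len(paths), 32):
    batch = [Image.open(p).convert("RGB") for p in paths[start:start + 32]]
    with torch.no_grad():
        pixels = processor(images=batch, return_tensors="pt")["pixel_values"].to(device)
        feats = model.get_image_features(pixels)
        feats /= feats.norm(dim=-1, keepdim=True)  # normalize, matching encode_image() in app.py
    features.append(feats.cpu().numpy())

np.save("features.npy", np.concatenate(features))
pd.DataFrame({"photo_id": [p.stem for p in paths]}).to_csv("photo_ids.csv", index=False)
```

Normalizing each row here is what lets `similarity()` in app.py rank photos with a plain dot product.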
features.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31ac381e52fa007821a642b5808ac9a6eaf7163322ab340d36bcc3c2a94a38c8
+ size 25596032
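The features file is tracked with Git LFS, so the repository holds only the pointer above and the payload is fetched on checkout. The stated size of 25,596,032 bytes is consistent with 12,498 rows of 512 float32 values plus the 128-byte .npy header, i.e. one ViT-B/32 feature vector per photo. A quick sanity check after cloning, assuming that layout:

```python
import numpy as np

feats = np.load("features.npy")
print(feats.shape, feats.dtype)   # expected: (12498, 512) float32
print(np.linalg.norm(feats[0]))   # ~1.0 if the rows are L2-normalized
```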
photo_ids.csv ADDED
The diff for this file is too large to render. See raw diff
 
photos.tsv000 ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ sentence-transformers==2.1.0
+ transformers
+ torch
+ numpy
+ ftfy