import os
import clip
import torch
from PIL import Image
# import sys
# from pptx import Presentation
# from pptx.enum.shapes import MSO_SHAPE_TYPE
# import time
class ClipImage:

    def __init__(self, path_of_ppt_folders, path_to_save_image_features, mode='image', device='cuda'):
        """
        :param path_of_ppt_folders: path of the folder containing all the ppt folders
        :param path_to_save_image_features: path to save the image features
        :param mode: 'image' or 'text', based on the type of query passed later to the search methods
        :param device: device to run the model on
        """
        print("HEADS UP -- ALWAYS using CPU for this 'spaces' version of the project. Otherwise we get FP32/FP16 conflicts.")
        # device = "cuda" if torch.cuda.is_available() else "cpu"
        device = "cpu"
        # Create the directory that will hold the cached image features
        directory = 'input_features'
        path = os.path.join(path_to_save_image_features, directory)
        if not os.path.exists(path):
            os.mkdir(path)
            print("Directory '%s' created" % directory)
        self.res = []
        if not os.path.isdir(path_of_ppt_folders):
            raise TypeError(f"{path_of_ppt_folders} is not a directory. Please only enter a directory.")
        # if mode == 'image' and not os.path.exists(input_image_path):
        #     raise FileNotFoundError(f"{input_image_path} does not exist.")
        if not os.path.exists(path_to_save_image_features) or not os.path.isdir(path_to_save_image_features):
            raise FileNotFoundError(f"{path_to_save_image_features} is not a directory or doesn't exist.")
        self.mode = mode
        self.path_of_ppt_folders = path_of_ppt_folders
        self.path_to_save_image_features = path_to_save_image_features
        self.device = device
        # ViT-L/14 is probably the strongest CLIP checkpoint, but ViT-B/32 is what's loaded here.
        self.model, self.preprocess = clip.load('ViT-B/32', self.device)
        # print("RUNNING CLIP'S ONE-TIME ENCODING STEP... will be slow the first time, and hopefully only the first time.")
        # Passing in an image as a cheap hack, to make one function work for the initial embedding.
        # self.calculate_similarity('/home/rsalvi/chatbotai/rohan/ai-teaching-assistant-uiuc/lecture_slides/001/Slide1.jpeg')
        # print("DONE with CLIP's ONE-TIME ENCODING")
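    # Expected on-disk layout (an inference from calculate_similarity below, not enforced anywhere):
    # path_of_ppt_folders holds one subfolder per lecture/deck, and each subfolder holds that deck's
    # exported slide images, e.g.
    #     lecture_slides/
    #         001/Slide1.jpeg, Slide2.jpeg, ...
    #         002/Slide1.jpeg, ...
    # Encoded features are cached as <path_to_save_image_features>/input_features/slides_<folder>_tensor.pt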
    def text_to_image_search(self, search_text: str, top_k_to_return: int = 4):
        """ Written after the fact by kastan, so that we don't have to call init every time. """
        assert type(search_text) == str, f"Must provide a single string, instead I got type {type(search_text)}"
        # self.create_input_features(search_text, mode='text')
        self.mode = 'text'
        return self.calculate_similarity(search_text, top_k_to_return)

    # TODO: WIP.
    def image_to_images_search(self, input_image, top_k_to_return: int = 4):
        """ Written after the fact by kastan, so that we don't have to call init every time. """
        self.mode = 'image'
        return self.calculate_similarity(input_image, top_k_to_return)
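    # Example call (hypothetical instance name; assumes a PIL.Image, such as the one Gradio provides):
    #     results = clip_image.image_to_images_search(Image.open("lecture_slides/01c/Slide5.jpeg"))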
    def create_input_features(self, input_text_or_img):
        if self.mode == 'image':
            # The query image already arrives from Gradio in PIL format, so no Image.open() is needed;
            # just run CLIP's preprocessing on it.
            input_arr = torch.cat([self.preprocess(input_text_or_img).unsqueeze(0)]).to(self.device)
        elif self.mode == 'text':
            # Tokenize the text query
            input_arr = torch.cat([clip.tokenize(f"{input_text_or_img}")]).to(self.device)
        else:
            raise ValueError(f"mode must be 'image' or 'text', got {self.mode!r}")
        # Encode the image or text, then L2-normalize the embedding
        with torch.no_grad():
            if self.mode == 'image':
                input_features = self.model.encode_image(input_arr)
            elif self.mode == 'text':
                input_features = self.model.encode_text(input_arr)
            input_features /= input_features.norm(dim=-1, keepdim=True)
        return input_features
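    # For the ViT-B/32 checkpoint loaded in __init__, create_input_features() should return a
    # unit-normalized tensor of shape (1, 512): one 512-dim CLIP embedding for the query.
    # (The 512 width is a property of ViT-B/32, not something checked anywhere in this file.)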
    def new_most_similar_slide_file(self, top_k: int):
        # Sort the results
        ans = sorted(self.res, key=lambda x: x[2], reverse=True)
        return ans[:top_k]
    def calculate_similarity(self, input_text_or_img, topk_val: int = 4):
        """Compare the query (image or text, per self.mode) against every slide image in every
        ppt folder, and return the top-k matches as (folder_name, slide_filename, score) tuples."""
        ## Similarities across folders
        self.res = []
        all_similarities = []
        slide_numbers = []
        # Create the input features for the query
        input_features = self.create_input_features(input_text_or_img)
        # Iterate through all the folders
        ppts = list(os.listdir(self.path_of_ppt_folders))
        # start_time = time.monotonic()
        for i in ppts:
            # Get the slide images inside this ppt folder
            imgs = list(os.listdir(os.path.join(self.path_of_ppt_folders, i)))
            slide_numbers.append(imgs)
            # Load the cached image features for this folder if they exist, otherwise encode and cache them
            feature_path = self.path_to_save_image_features + '/input_features' + "/slides_" + i + "_tensor.pt"
            if os.path.exists(feature_path):
                image_features = torch.load(feature_path, map_location=self.device)
            else:
                # Preprocess and encode every image in the folder, then save the encoding
                with torch.no_grad():
                    image_input = torch.cat([
                        self.preprocess(Image.open(os.path.join(self.path_of_ppt_folders, i, image))).unsqueeze(0) for image in imgs
                    ]).to(self.device)
                    image_features = self.model.encode_image(image_input)
                    image_features /= image_features.norm(dim=-1, keepdim=True)
                torch.save(image_features, feature_path)
                print("Saved the image features (for faster future loading) to: ", feature_path)
            # Calculate the similarity between the query and the images in this folder.
            # The math is identical for image and text queries, since both are CLIP embeddings.
            # TODO: THIS REQUIRES REFACTOR. We're only looking in a SINGLE FOLDER. need to APPEND to similarity.
            similarity = (100.0 * input_features @ image_features.T).softmax(dim=-1)
            all_similarities.append((i, similarity))
        ## Looking over all the folders: flatten into (folder_name, slide_filename, score) tuples
        for j in range(len(all_similarities)):
            folder_name = all_similarities[j][0]
            folder_values = all_similarities[j][1][0]
            for k in range(len(folder_values)):
                self.res.append((folder_name, slide_numbers[j][k], folder_values[k]))
        # print(self.res)
        # Return the sorted (top-k) results
        return self.new_most_similar_slide_file(topk_val)
# if __name__ == "__main__":
# demo = ClipImage('/home/rsalvi/chatbotai/rohan/ai-teaching-assistant-uiuc/lecture_slides','/home/rsalvi/chatbotai/rohan/ai-teaching-assistant-uiuc')
# #op = demo.image_to_images_search('/home/rsalvi/chatbotai/rohan/ai-teaching-assistant-uiuc/lecture_slides/01c/Slide5.jpeg')
# op = demo.text_to_image_search("Unsigned Bit Pattern")
# print(op)
# op = demo.text_to_image_search("Graycode")
# print(op)
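
# A minimal, runnable usage sketch. The paths below are placeholders (assumptions, not real paths):
# point path_of_ppt_folders at a directory of per-lecture slide-image subfolders, and
# path_to_save_image_features at any writable location (an 'input_features/' cache is created inside it).
if __name__ == "__main__":
    searcher = ClipImage(path_of_ppt_folders="./lecture_slides",
                         path_to_save_image_features=".")
    # Text query -> top-k (folder_name, slide_filename, similarity_score) tuples
    print(searcher.text_to_image_search("Unsigned Bit Pattern", top_k_to_return=4))
    # Image query (expects a PIL.Image, e.g. the object Gradio hands over)
    print(searcher.image_to_images_search(Image.open("./lecture_slides/001/Slide1.jpeg"), top_k_to_return=4))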