import gradio as gr
import torch
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def clip_inference(input_img, input_text):
    # Split input_text into a list of text entries
    text_entries = [text.strip() for text in input_text.split(",")]

    # Prepare inputs for the CLIP model
    inputs = processor(text=text_entries, images=input_img, return_tensors="pt", padding=True)

    # Get image-text similarity scores and convert them to probabilities
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)

    # Format the probabilities as a comma-separated string
    return ", ".join(str(prob.item()) for prob in probs[0])


title = "CLIP OpenAI Model"
description = "Find the similarity between an image and multiple text entries (separated by commas)."

text_examples = [
    "a sky full of stars, a painting",
    "a dog playing in the garden, a dog sleeping in the garden",
    "a small girl dancing, a small girl playing guitar",
    "a small family cooking in the kitchen, a family watching a movie",
    "students inside the class, students playing in the playground",
    "a traffic signal, a lot of cars",
    "a theatre, a football stadium",
    "group of animals, group of birds",
    "yellow sunflowers, red roses",
    "sunset across the lake, sky full of stars",
]

# Pair each example image with its corresponding text prompts
examples = [["examples/images_" + str(i) + ".jpg", text] for i, text in enumerate(text_examples)]

demo = gr.Interface(
    clip_inference,
    inputs=[
        gr.Image(label="Input image"),
        gr.Textbox(placeholder="Input text: multiple entries separated by commas"),
    ],
    outputs=[gr.Textbox(label="Similarity scores")],
    title=title,
    description=description,
    examples=examples,
)

demo.launch()