# Hugging Face Spaces page header captured together with this file; Space status: "Runtime error".
import gradio as gr | |
import clip | |
from model import ClipCaptionModel | |
from transformers import GPT2Tokenizer | |
import numpy as np | |
import torch | |
import PIL | |
from predict import generate2, generate_beam | |
from huggingface_hub import hf_hub_download | |
# ---------------------------------------------------------------------------
# Runtime configuration and one-time resource loading (runs at import time).
# ---------------------------------------------------------------------------

D = torch.device  # alias for the torch.device type; not referenced elsewhere in this file
CPU = torch.device('cpu')  # used as map_location when checkpoints are read with torch.load
pretrained_model_variance = "0.015"  # not read elsewhere in this file; equals the dropdown's default value
device = "cpu"  # everything in this demo runs on the CPU

# CLIP image encoder and the preprocessing callable returned alongside it.
clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# One CapDec checkpoint per selectable noise level. Each call fetches
# 'model.pt' from the named Hub repository; the returned values are later
# passed to torch.load by load_noise_level_model.
model_0 = hf_hub_download('johko/capdec_0', 'model.pt')
model_001 = hf_hub_download('johko/capdec_001', 'model.pt')
model_005 = hf_hub_download('johko/capdec_005', 'model.pt')
model_015 = hf_hub_download('johko/capdec_015', 'model.pt')
model_025 = hf_hub_download('johko/capdec_025', 'model.pt')
model_05 = hf_hub_download('johko/capdec_05', 'model.pt')

# This name used to be produced by a second, identical hf_hub_download call
# for 'johko/capdec_015'. It is kept for backward compatibility but now
# simply reuses the result above instead of querying the Hub twice.
model_path = model_015
# Noise levels for which a checkpoint is available, as the strings the UI sends.
_SUPPORTED_NOISE_LEVELS = ("0.0", "0.001", "0.005", "0.015", "0.025", "0.05")

# Single-entry cache {noise_level: model} holding the most recently loaded model,
# so repeated requests at the same noise level do not re-read the checkpoint.
_LOADED_MODEL_CACHE = {}


def load_noise_level_model(noise_level):
    """Return the captioning model for the given noise level.

    A fresh ``ClipCaptionModel`` is built, its weights are loaded from the
    checkpoint that belongs to ``noise_level`` (mapped onto the CPU), and the
    model is put in eval mode and moved to ``device``.

    The most recently loaded model is cached: calling again with the same
    ``noise_level`` returns the same instance without touching the disk.
    Callers should therefore treat the returned model as read-only.

    Args:
        noise_level: One of "0.0", "0.001", "0.005", "0.015", "0.025", "0.05"
            (as a string).

    Returns:
        The loaded model, in eval mode, on ``device``.

    Raises:
        ValueError: If ``noise_level`` is not one of the supported values.
    """
    # Validate first so that a bad value never touches the cache or the disk.
    if noise_level not in _SUPPORTED_NOISE_LEVELS:
        raise ValueError("Unknown Noise Level")

    cached_model = _LOADED_MODEL_CACHE.get(noise_level)
    if cached_model is not None:
        return cached_model

    # Checkpoint paths are listed in the same order as _SUPPORTED_NOISE_LEVELS.
    checkpoint_by_level = dict(zip(
        _SUPPORTED_NOISE_LEVELS,
        (model_0, model_001, model_005, model_015, model_025, model_05),
    ))
    model_path = checkpoint_by_level[noise_level]

    # Release the previously cached model before building the next one so the
    # cache itself never keeps more than one set of weights alive.
    _LOADED_MODEL_CACHE.clear()

    model = ClipCaptionModel()
    model.load_state_dict(torch.load(model_path, map_location=CPU))
    model = model.eval()
    model = model.to(device)

    _LOADED_MODEL_CACHE[noise_level] = model
    return model
def infer(input_image: np.ndarray, noise_level: str, *, use_beam_search: bool = True):
    """Generate a caption for an image with the model of the chosen noise level.

    Args:
        input_image: The image as a NumPy array (converted with
            ``PIL.Image.fromarray``).
        noise_level: Noise level identifier forwarded to
            ``load_noise_level_model``.
        use_beam_search: When true (the default, and what the UI uses), the
            caption is the first result of ``generate_beam``; otherwise the
            result of ``generate2`` is used.

    Returns:
        A ``(input_image, caption)`` tuple: the unmodified input image and the
        generated text.

    Raises:
        ValueError: If ``input_image`` is ``None`` (nothing was uploaded) or
            ``noise_level`` is not supported.
    """
    # Fail early with a readable message, before any model is loaded.
    if input_image is None:
        raise ValueError("No input image provided")

    # `import PIL` alone does not guarantee that the `PIL.Image` submodule is
    # loaded, so import it explicitly instead of relying on another package
    # having done so.
    from PIL import Image

    model = load_noise_level_model(noise_level)

    pil_image = Image.fromarray(input_image)
    # preprocess(...) yields a single image; unsqueeze(0) adds the batch axis.
    image = preprocess(pil_image).unsqueeze(0).to(device)

    # Pure inference: no gradients are needed for encoding or generation.
    with torch.no_grad():
        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
        # NOTE(review): 40 is presumably the prefix length the checkpoints were
        # trained with - confirm against ClipCaptionModel in model.py.
        prefix_embed = model.clip_project(prefix).reshape(1, 40, -1)
        if use_beam_search:
            # generate_beam's result is indexed, so only its first entry is used.
            generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
        else:
            generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)

    return input_image, generated_text_prefix
# Markdown text shown under the title of the demo page.
description = """This space is a demo for the paper [*Text-Only Training for Image Captioning using Noise-Injected CLIP*](https://arxiv.org/pdf/2211.00575.pdf)
by David Nukrai, Ron Mokady and Amir Globerson.
The paper is about training an Image Captioning model by only using text. It leverages the usage of noise injections at different Noise Levels,
with which you can experiment as well in this demo. The text caption will change depending on the Noise Level you choose."""

# Values offered in the "Noise Level" dropdown; passed to infer() as strings.
noise_level_choices = ["0.0", "0.001", "0.005", "0.015", "0.025", "0.05"]

# Pre-filled example rows: [image path, noise level].
example_inputs = [
    ["examples/flickr_ex2.jpg", "0.015"],
    ["examples/web_ex3.jpeg", "0.015"],
]

# UI components: two inputs (image + noise level) and two outputs
# (the image and the generated caption).
dropdown = gr.components.Dropdown(
    noise_level_choices,
    value="0.015",
    label="Noise Level",
)
input_image = gr.components.Image(label="Input Image")
output_image = gr.components.Image(label="Image")
output_text = gr.components.Textbox(label="Generated Caption")

# Wire infer() to the components and start serving the demo.
iface = gr.Interface(
    fn=infer,
    inputs=[input_image, dropdown],
    outputs=[output_image, output_text],
    examples=example_inputs,
    title="CapDec Image Captioning",
    description=description,
)
iface.launch()