Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,901 Bytes
c91d9f3 c580f5e b9c7982 c91d9f3 b9c7982 c91d9f3 9174c87 1de48dc c580f5e 1de48dc b9c7982 c91d9f3 dcd8e07 45113e4 c91d9f3 dcd8e07 b9c7982 dcd8e07 b9c7982 45113e4 1c21246 dcd8e07 45113e4 dcd8e07 178fb4b c91d9f3 dcd8e07 c51582e dcd8e07 c91d9f3 dcd8e07 c91d9f3 33262af b9c7982 c51582e a012928 178fb4b 33262af b9c7982 61104d4 c91d9f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import functools
import os
from io import BytesIO

import gradio as gr
import requests
import spaces  # Import the spaces module
import torch
from PIL import Image
from transformers import (
    PaliGemmaForConditionalGeneration,
    PaliGemmaProcessor,
)
from transformers.image_utils import load_image
@functools.lru_cache(maxsize=1)
def load_model():
    """Load the PaliGemma2 processor and model using a Hugging Face token.

    The result is memoized: after the first successful call in a process,
    later calls return the same ``(processor, model)`` pair instead of
    reloading the checkpoint. A call that raises is not cached, so it is
    retried on the next invocation.

    Returns:
        A ``(processor, model)`` tuple. The model is loaded in bfloat16,
        moved to CUDA when ``torch.cuda.is_available()`` is true (CPU
        otherwise), and switched to eval mode.

    Raises:
        ValueError: If the ``HUGGINGFACEHUB_API_TOKEN`` environment variable
            is unset or empty.
    """
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not token:
        raise ValueError(
            "Hugging Face API token not found. Please set it in the environment variables."
        )

    model_id = "google/paligemma2-10b-pt-448"

    # `token=` is the current keyword; `use_auth_token=` is deprecated.
    processor = PaliGemmaProcessor.from_pretrained(model_id, token=token)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = (
        PaliGemmaForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=torch.bfloat16, token=token
        )
        .to(device)
        .eval()
    )
    return processor, model
@spaces.GPU(duration=120)  # Increased timeout to 120 seconds
def process_image_and_text(image_pil, num_beams, temperature, seed):
    """Generate text for an image using PaliGemma2.

    Args:
        image_pil: The image to run the model on; it is passed through
            ``load_image`` before being handed to the processor.
        num_beams: Beam count forwarded to ``model.generate``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        seed: Value given to ``torch.manual_seed`` before generation.

    Returns:
        The decoded generated text, with the prompt tokens removed and
        special tokens skipped.

    Raises:
        gr.Error: If no image was provided, or if loading the model,
            preprocessing, or generation fails.
    """
    if image_pil is None:
        # Fail with a clear message instead of an opaque load_image error.
        raise gr.Error("Please upload an image before running the model.")

    try:
        processor, model = load_model()

        image = load_image(image_pil)

        # The processor expects an explicit <image> placeholder at the start
        # of the prompt; the rest of the prompt is intentionally blank.
        text_input = "<image> "

        # Follow the model's actual placement/dtype rather than re-deriving
        # them, so inputs can never end up on a different device.
        model_inputs = processor(
            text=text_input, images=image, return_tensors="pt"
        ).to(model.device, dtype=model.dtype)
        input_len = model_inputs["input_ids"].shape[-1]

        torch.manual_seed(seed)  # Set random seed for reproducibility
        with torch.inference_mode():
            generation = model.generate(
                **model_inputs,
                max_new_tokens=200,
                do_sample=True,
                num_beams=int(num_beams),  # generate needs an integer beam count
                temperature=temperature,
            )

        # Drop the prompt tokens so only newly generated text is decoded.
        generation = generation[0][input_len:]
        return processor.decode(generation, skip_special_tokens=True)
    except gr.Error:
        # Already user-facing; do not wrap it a second time.
        raise
    except Exception as e:
        print(f"Error during GPU task: {e}")
        raise gr.Error(f"GPU task failed: {e}") from e
if __name__ == "__main__":
    # Build the UI: one image input plus generation controls, wired to
    # process_image_and_text in the same order as its parameters.
    iface = gr.Interface(
        fn=process_image_and_text,
        inputs=[
            gr.Image(type="pil", label="Upload an image"),
            gr.Slider(minimum=1, maximum=10, step=1, value=10, label="Number of Beams"),
            gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
            gr.Number(label="Random Seed", value=0, precision=0),
        ],
        outputs=gr.Textbox(label="Generated Text"),
        title="PaliGemma2 Image to Text",
        description="Upload an image and the model will generate text.",
    )
    iface.launch()