import gradio as gr
from transformers import (
    PaliGemmaProcessor,
    PaliGemmaForConditionalGeneration,
)
from transformers.image_utils import load_image
import torch
import os
import spaces  # provides the @spaces.GPU decorator for ZeroGPU Spaces


def load_model():
    """Load PaliGemma2 model and processor with Hugging Face token."""

    token = os.getenv("HUGGINGFACEHUB_API_TOKEN")  # Retrieve token from environment variable

    if not token:
        raise ValueError(
            "Hugging Face API token not found. Please set it in the environment variables."
        )

    # Load the processor and model for the gated PaliGemma2 checkpoint
    model_id = "google/paligemma2-10b-pt-448"
    processor = PaliGemmaProcessor.from_pretrained(model_id, token=token)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,  # the dtype the PaliGemma2 weights are released in
        token=token,  # `use_auth_token` is deprecated in recent transformers
    ).to(device).eval()

    return processor, model


@spaces.GPU(duration=120)  # allow up to 120 seconds of GPU time for this call
def process_image_and_text(image_pil, num_beams, temperature, seed):
    """Extract text from image using PaliGemma2."""
    try:
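        # NOTE: load_model() re-loads the 10B checkpoint on every request; in a
        # long-lived process, caching the processor/model (e.g. at module scope)
        # would avoid the repeated load.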
        processor, model = load_model()
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # load_image accepts PIL images, file paths, or URLs and returns an RGB PIL image
        image = load_image(image_pil)

        # PaliGemma prompts begin with one <image> token per input image; the
        # processor expands it into the full image-token sequence.
        text_input = "<image>"

        model_inputs = processor(text=text_input, images=image, return_tensors="pt").to(
            device, dtype=torch.bfloat16
        )
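        # Record the prompt length so the prompt tokens can be stripped from the output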
        input_len = model_inputs["input_ids"].shape[-1]

        torch.manual_seed(seed)  # seed CPU and CUDA RNGs for reproducible sampling

        with torch.inference_mode():
            generation = model.generate(
                **model_inputs,
                max_new_tokens=200,
                do_sample=True,  # beam-sample decoding: sampling within each beam
                num_beams=num_beams,
                temperature=temperature,
            )
            generation = generation[0][input_len:]  # drop the prompt tokens
            decoded = processor.decode(generation, skip_special_tokens=True)

        return decoded
    except Exception as e:
        print(f"Error during GPU task: {e}")
        raise gr.Error(f"GPU task failed: {e}") from e


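# Build the Gradio UI: image upload plus decoding controls (beams, temperature, seed)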
if __name__ == "__main__":
    iface = gr.Interface(
        fn=process_image_and_text,
        inputs=[
            gr.Image(type="pil", label="Upload an image"),
            gr.Slider(minimum=1, maximum=10, step=1, value=10, label="Number of Beams"),
            gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
            gr.Number(label="Random Seed", value=0, precision=0),
        ],
        outputs=gr.Textbox(label="Generated Text"),
        title="PaliGemma2 Image to Text",
        description="Upload an image and PaliGemma2 will generate descriptive text for it.",
    )
    iface.launch()