import gradio as gr
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import torch
import os

# Load the model and processor
model_id = "google/paligemma-3b-mix-224"
HF_TOKEN = os.getenv('HF_TOKEN')
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, token=HF_TOKEN).eval()
processor = AutoProcessor.from_pretrained(model_id, token=HF_TOKEN)


def generate_caption(image, prompt="What is in this image?", max_tokens=100):
    """Generate an image description for the given image and prompt."""
    if image is None:
        return "Please upload an image."

    # Update UI to show processing
    gr.Info("Analysis starting. This may take up to 119 seconds.")

    # Modify prompt to include the image token expected by the processor
    full_prompt = "<image> " + prompt

    # Preprocess inputs
    model_inputs = processor(text=full_prompt, images=image, return_tensors="pt")
    input_len = model_inputs["input_ids"].shape[-1]

    # Generate caption and decode only the newly generated tokens
    with torch.inference_mode():
        generation = model.generate(**model_inputs, max_new_tokens=max_tokens, do_sample=False)
        generation = generation[0][input_len:]
        decoded = processor.decode(generation, skip_special_tokens=True)

    return decoded


def load_local_images():
    """Load example images bundled with the repository."""
    image_files = ['image1.jpg', 'image2.jpg', 'image3.jpg']
    local_images = []
    for img_file in image_files:
        try:
            img_path = os.path.join('.', img_file)
            if os.path.exists(img_path):
                local_images.append(Image.open(img_path))
        except Exception as e:
            print(f"Could not load {img_file}: {e}")
    return local_images


# Prepare example images
EXAMPLE_IMAGES = load_local_images()

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# PaliGemma Image Analysis")

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload or Select Image")
            custom_prompt = gr.Textbox(label="Custom Prompt", value="What is in this image?")
            submit_btn = gr.Button("Analyze Image")
        with gr.Column():
            output_text = gr.Textbox(label="Image Description")

    # Connect components
    submit_btn.click(
        fn=generate_caption,
        inputs=[input_image, custom_prompt],
        outputs=output_text
    )

    # Add example images
    gr.Examples(
        examples=[[img, "What is in this image?"] for img in EXAMPLE_IMAGES],
        inputs=[input_image, custom_prompt],
        fn=generate_caption,
        outputs=output_text
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()