"""Gradio app: multilingual image captioning with PaliGemma.

Loads google/paligemma-3b-mix-224 once at startup and serves a simple
upload-and-caption UI supporting English, Spanish, French, and German.
"""

import gradio as gr
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import torch
import requests

# Load the model and processor once at startup (requires HF auth token).
model_id = "google/paligemma-3b-mix-224"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, token=True).eval()
processor = AutoProcessor.from_pretrained(model_id, token=True)

# Supported languages mapped to their PaliGemma caption task prompts.
LANGUAGES = {
    "English": "caption en",
    "Spanish": "caption es",
    "French": "caption fr",
    "German": "caption de",
}


def generate_caption(image, language, max_tokens=100):
    """Generate image caption in specified language.

    Args:
        image: PIL image to caption; ``None`` returns a prompt message
            instead of raising.
        language: Key into ``LANGUAGES``; unknown values fall back to
            English.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The decoded caption string (prompt tokens stripped).
    """
    if image is None:
        return "Please upload an image."

    prompt = LANGUAGES.get(language, "caption en")

    # Preprocess inputs and move them to the model's device.
    # Fix: the original left tensors on CPU, which fails when the model
    # has been placed on GPU.
    model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
    input_len = model_inputs["input_ids"].shape[-1]

    # Generate caption deterministically (greedy decoding).
    with torch.inference_mode():
        generation = model.generate(
            **model_inputs, max_new_tokens=max_tokens, do_sample=False
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    new_tokens = generation[0][input_len:]
    return processor.decode(new_tokens, skip_special_tokens=True)


def load_example_image(url):
    """Load example image from URL.

    Raises:
        requests.HTTPError: If the server returns an error status.
        requests.RequestException: On connection failures or timeout.
    """
    # Fix: the original never checked the HTTP status, so an error page
    # would be handed to PIL; it also had no timeout, so a hung
    # connection could block startup indefinitely.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    return Image.open(response.raw)


def _load_example_images(urls):
    """Best-effort download of example images.

    Skips any URL that fails so a transient network problem does not
    crash the whole app at import time (fix: original had no handling).
    """
    images = []
    for url in urls:
        try:
            images.append(load_example_image(url))
        except (requests.RequestException, OSError):
            # Missing examples are not fatal; continue with the rest.
            pass
    return images


# Prepare example images (best-effort; may be a subset on network errors).
EXAMPLE_URLS = [
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/food.jpg",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/city.jpg",
]
EXAMPLE_IMAGES = _load_example_images(EXAMPLE_URLS)

# Create Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# PaliGemma Image Captioning")
    gr.Markdown("Upload an image and get a caption in your preferred language!")

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Upload Image")
            language_dropdown = gr.Dropdown(
                list(LANGUAGES.keys()),
                value="English",
                label="Caption Language",
            )
            submit_btn = gr.Button("Generate Caption")
        with gr.Column():
            output_text = gr.Textbox(label="Generated Caption")

    # Connect components
    submit_btn.click(
        fn=generate_caption,
        inputs=[input_image, language_dropdown],
        outputs=output_text,
    )

    # Add example images — only if at least one download succeeded,
    # since gr.Examples rejects an empty example list.
    if EXAMPLE_IMAGES:
        gr.Examples(
            examples=[
                [img, lang] for img in EXAMPLE_IMAGES for lang in LANGUAGES.keys()
            ],
            inputs=[input_image, language_dropdown],
            fn=generate_caption,
            outputs=output_text,
        )

# Launch the app
if __name__ == "__main__":
    demo.launch()