import gradio as gr
import torch
from transformers import (
    Idefics2Processor, Idefics2ForConditionalGeneration,
    Blip2Processor, Blip2ForConditionalGeneration
)
from PIL import Image
import time
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu
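# Make sure the NLTK "punkt" tokenizer is available for BLEU scoring.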
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
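# Model registry: Hugging Face checkpoint IDs, processor/model classes, and the
# caption prompt each model expects.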
models = {
    "IDEFICS2": {
        "model_id": "HuggingFaceM4/idefics2-8b",
        "processor_class": Idefics2Processor,
        "model_class": Idefics2ForConditionalGeneration,
        "caption_prompt": "<image>Describe the image in detail"
    },
    "BLIP2": {
        "model_id": "Salesforce/blip2-opt-2.7b",
        "processor_class": Blip2Processor,
        "model_class": Blip2ForConditionalGeneration,
        "caption_prompt": ""
    }
}
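# Load each processor/model pair once at startup and keep it in memory.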
model_instances = {}
for model_name, config in models.items():
    processor = config["processor_class"].from_pretrained(config["model_id"])
    model = config["model_class"].from_pretrained(config["model_id"]).to(device)
    model_instances[model_name] = (processor, model)
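# Example VQA questions for the demo.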
vqa_questions = [
    "Are there people in the image?",
    "Which color predominates in the image?"
]
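# Single generic reference caption used to compute BLEU.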
reference_caption = ["An image with people and various objects"]
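# Run captioning or VQA with the selected model and return the outputs plus simple metrics.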
def infer(image, model_name, task, question=None):
    if image is None:
        return "Please upload an image.", None, None, None, None, None

    image = Image.open(image).convert("RGB")
    if "BLIP2" in model_name:
        # BLIP-2 inputs are resized to 224x224 in this demo.
        image = image.resize((224, 224))

    processor, model = model_instances[model_name]

    start_time = time.time()
    # Snapshot of allocated GPU memory (GB) before generation; 0 on CPU.
    vram = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
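    # Captioning: generate a caption and score it with BLEU against the generic reference.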
    if task == "captioning":
        caption_prompt = models[model_name]["caption_prompt"]
        caption_text = "" if "BLIP2" in model_name else caption_prompt
        inputs = processor(images=image, text=caption_text, return_tensors="pt").to(device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=5 if "BLIP2" in model_name else 1,
            no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
        )
        caption = processor.decode(output_ids[0], skip_special_tokens=True)
        inference_time = time.time() - start_time

        bleu_score = sentence_bleu([reference_caption[0].split()], caption.split()) if caption else 0.0

        return (caption, inference_time, None, None, vram, bleu_score)
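    # VQA: answer the user's question about the image.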
    elif task == "vqa" and question:
        vqa_text = question if "BLIP2" in model_name else f"<image>Q: {question}"
        inputs = processor(images=image, text=vqa_text, return_tensors="pt").to(device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=10,
            num_beams=5 if "BLIP2" in model_name else 1,
            no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
        )
        vqa_answer = processor.decode(output_ids[0], skip_special_tokens=True)
        inference_time = time.time() - start_time

        return (None, None, vqa_answer, inference_time, vram, None)

    return "Select a valid task and, for VQA, a question.", None, None, None, None, None
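# Gradio UI: inputs on the left, generated outputs and metrics on the right.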
with gr.Blocks(title="MLLM Benchmark Demo") as demo:
    gr.Markdown("# Benchmark Demo for Multimodal Models (MLLMs)")
    gr.Markdown("Upload an image, select a model and a task, and get captioning or VQA results.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload Image")
            model_dropdown = gr.Dropdown(choices=["IDEFICS2", "BLIP2"], label="Select Model", value="IDEFICS2")
            task_dropdown = gr.Dropdown(choices=["captioning", "vqa"], label="Select Task", value="captioning")
            question_input = gr.Textbox(label="VQA Question (optional, only for VQA)", placeholder="E.g.: Are there people in the image?")
            submit_btn = gr.Button("Generate")
        with gr.Column():
            caption_output = gr.Textbox(label="Generated Caption")
            caption_time_output = gr.Number(label="Captioning Time (s)")
            vqa_output = gr.Textbox(label="VQA Answer")
            vqa_time_output = gr.Number(label="VQA Time (s)")
            vram_output = gr.Number(label="VRAM (GB)")
            bleu_output = gr.Number(label="BLEU Score")

    # Wire the button to infer(); the outputs match the 6-tuple returned by infer().
    submit_btn.click(
        fn=infer,
        inputs=[image_input, model_dropdown, task_dropdown, question_input],
        outputs=[caption_output, caption_time_output, vqa_output, vqa_time_output, vram_output, bleu_output]
    )
    gr.Markdown("### Notes")
    gr.Markdown("""
- This demo uses a subset of COCO 2017 images. Download the images from [cocodataset.org](http://cocodataset.org/#download) if you want to use your own.
- A GPU is recommended for best results, but the demo also runs on CPU with longer inference times.
- The BLEU metric uses a single generic reference and may not reflect real caption quality.
- For more details, see the [paper repository](https://huggingface.co/datasets/pdro-ruiz/multimodal-llms-paper-2025).
    """)
if __name__ == "__main__":
    demo.launch()