Spaces:

Pdro-ruiz
/

MLLM_Estado_del_Arte_Feb25

Running

File size: 5,454 Bytes

fc4df88
 
 
 
f08cdaf
fc4df88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f08cdaf
fc4df88
 
 
 
 
 
 
 
 
f08cdaf
fc4df88
 
 
f08cdaf
fc4df88
 
 
 
f08cdaf
 
fc4df88
 
f08cdaf
 
fc4df88
f08cdaf
fc4df88
f08cdaf
fc4df88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f08cdaf
 
 
 
fc4df88
 
f08cdaf
fc4df88
 
 
 
 
 
 
 
 
 
f08cdaf
fc4df88
f08cdaf
fc4df88
 
f08cdaf
fc4df88
f08cdaf
fc4df88
 
 
 
 
 
f08cdaf
fc4df88
 
 
 
 
 
 
 
f08cdaf
fc4df88
f08cdaf
fc4df88
 
 
 
f08cdaf
 
fc4df88
 
 
 
75f50ff

import gradio as gr
import torch
from transformers import (
    Idefics2Processor, Idefics2ForConditionalGeneration,
    Blip2Processor, Blip2ForConditionalGeneration
)
from PIL import Image
import time
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Descargar 'punkt' si no está disponible
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# Configuración del dispositivo
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

# Definición de modelos
models = {
    "IDEFICS2": {
        "model_id": "HuggingFaceM4/idefics2-8b",
        "processor_class": Idefics2Processor,
        "model_class": Idefics2ForConditionalGeneration,
        "caption_prompt": "<image>Describe the image in detail"
    },
    "BLIP2": {
        "model_id": "Salesforce/blip2-opt-2.7b",
        "processor_class": Blip2Processor,
        "model_class": Blip2ForConditionalGeneration,
        "caption_prompt": ""  # Prompt vacío para BLIP2
    }
}

# Cargar modelos (pre-cargados para evitar retrasos)
model_instances = {}
for model_name, config in models.items():
    processor = config["processor_class"].from_pretrained(config["model_id"])
    model = config["model_class"].from_pretrained(config["model_id"]).to(device)
    model_instances[model_name] = (processor, model)

# Preguntas VQA predefinidas
vqa_questions = [
    "Are there people in the image?",
    "Which color predominates in the image?"
]

# Referencia genérica para BLEU (puedes ajustar según necesidades)
reference_caption = ["An image with people and various objects"]

def infer(image, model_name, task, question=None):
    if image is None:
        return "Por favor, sube una imagen.", None, None, None, None, None

    # Abrir y preparar la imagen
    image = Image.open(image).convert("RGB")
    if "BLIP2" in model_name:
        image = image.resize((224, 224))

    processor, model = model_instances[model_name]
    
    start_time = time.time()
    vram = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0

    if task == "captioning":
        caption_prompt = models[model_name]["caption_prompt"]
        caption_text = "" if "BLIP2" in model_name else caption_prompt
        inputs = processor(images=image, text=caption_text, return_tensors="pt").to(device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=5 if "BLIP2" in model_name else 1,
            no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
        )
        caption = processor.decode(output_ids[0], skip_special_tokens=True)
        inference_time = time.time() - start_time

        # Calcular BLEU (simplificado, usando referencia genérica)
        bleu_score = sentence_bleu([reference_caption[0].split()], caption.split()) if caption else 0.0

        return (caption, inference_time, None, None, vram, bleu_score)

    elif task == "vqa" and question:
        vqa_text = question if "BLIP2" in model_name else f"<image>Q: {question}"
        inputs = processor(images=image, text=vqa_text, return_tensors="pt").to(device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=10,
            num_beams=5 if "BLIP2" in model_name else 1,
            no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
        )
        vqa_answer = processor.decode(output_ids[0], skip_special_tokens=True)
        inference_time = time.time() - start_time

        return (None, None, vqa_answer, inference_time, vram, None)

    return "Selecciona una tarea válida y, para VQA, una pregunta.", None, None, None, None, None

# Interfaz Gradio
with gr.Blocks(title="MLLM Benchmark Demo") as demo:
    gr.Markdown("# Benchmark para Modelos Multimodales (MLLMs)")
    gr.Markdown("Sube una imagen, selecciona un modelo y una tarea, y obtén resultados de captioning o VQA.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Subir Imagen")
            model_dropdown = gr.Dropdown(choices=["IDEFICS2", "BLIP2"], label="Seleccionar Modelo", value="IDEFICS2")
            task_dropdown = gr.Dropdown(choices=["captioning", "vqa"], label="Seleccionar Tarea", value="captioning")
            question_input = gr.Textbox(label="Pregunta VQA (opcional, solo para VQA)", placeholder="Ej: Are there people in the image?")
            submit_btn = gr.Button("Generar")

        with gr.Column():
            caption_output = gr.Textbox(label="Subtítulo Generado")
            vqa_output = gr.Textbox(label="Respuesta VQA")
            metrics_output = gr.Textbox(label="Métricas (Tiempo, VRAM, BLEU)")

    submit_btn.click(
        fn=infer,
        inputs=[image_input, model_dropdown, task_dropdown, question_input],
        outputs=[caption_output, gr.Number(label="Tiempo Captioning (s)"), vqa_output, gr.Number(label="Tiempo VQA (s)"), gr.Number(label="VRAM (GB)"), gr.Number(label="BLEU Score")]
    )

    gr.Markdown("### Notas")
    gr.Markdown("""
    - para mejroar la velocidad de inferencia, descarga en local y usar GPU avanzada.
    - La métrica BLEU usa una referencia genérica y puede no reflejar la calidad real.
    - Para más detalles, consulta el [repositorio del paper](https://huggingface.co/spaces/Pdro-ruiz/MLLM_Estado_del_Arte_Feb25/tree/main).
    """)

if __name__ == "__main__":
    demo.launch()