Spaces:

Pdro-ruiz
/

MLLM_Estado_del_Arte_Feb25

Running

App Files Community

Pdro-ruiz committed on Mar 3

Commit

b9efe6a

verified ·

1 Parent(s): fc4df88

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -24

app.py CHANGED Viewed

@@ -2,7 +2,8 @@ import gradio as gr
 import torch
 from transformers import (
     Idefics2Processor, Idefics2ForConditionalGeneration,
-    Blip2Processor, Blip2ForConditionalGeneration
 )
 from PIL import Image
 import time
@@ -26,7 +27,7 @@ models = {
         "model_id": "HuggingFaceM4/idefics2-8b",
         "processor_class": Idefics2Processor,
         "model_class": Idefics2ForConditionalGeneration,
-        "caption_prompt": "<image>Describe the image in detail"
     },
     "BLIP2": {
         "model_id": "Salesforce/blip2-opt-2.7b",
@@ -36,25 +37,32 @@ models = {
     }
 }
-# Cargar modelos (pre-cargados para evitar retrasos)
 model_instances = {}
 for model_name, config in models.items():
     processor = config["processor_class"].from_pretrained(config["model_id"])
-    model = config["model_class"].from_pretrained(config["model_id"]).to(device)
     model_instances[model_name] = (processor, model)
 # Preguntas VQA predefinidas
 vqa_questions = [
-    "Are there people in the image?",
-    "Which color predominates in the image?"
 ]
-# Referencia genérica para BLEU (puedes ajustar según necesidades)
-reference_caption = ["An image with people and various objects"]
-def infer(image, model_name, task, question=None):
     if image is None:
-        return "Por favor, sube una imagen.", None, None, None, None, None
     # Abrir y preparar la imagen
     image = Image.open(image).convert("RGB")
@@ -79,13 +87,28 @@ def infer(image, model_name, task, question=None):
         caption = processor.decode(output_ids[0], skip_special_tokens=True)
         inference_time = time.time() - start_time
-        # Calcular BLEU (simplificado, usando referencia genérica)
-        bleu_score = sentence_bleu([reference_caption[0].split()], caption.split()) if caption else 0.0
-        return (caption, inference_time, None, None, vram, bleu_score)
     elif task == "vqa" and question:
-        vqa_text = question if "BLIP2" in model_name else f"<image>Q: {question}"
         inputs = processor(images=image, text=vqa_text, return_tensors="pt").to(device)
         output_ids = model.generate(
             **inputs,
@@ -96,38 +119,53 @@ def infer(image, model_name, task, question=None):
         vqa_answer = processor.decode(output_ids[0], skip_special_tokens=True)
         inference_time = time.time() - start_time
-        return (None, None, vqa_answer, inference_time, vram, None)
-    return "Selecciona una tarea válida y, para VQA, una pregunta.", None, None, None, None, None
 # Interfaz Gradio
-with gr.Blocks(title="MLLM Benchmark Demo") as demo:
     gr.Markdown("# Benchmark para Modelos Multimodales (MLLMs)")
-    gr.Markdown("Sube una imagen, selecciona un modelo y una tarea, y obtén resultados de captioning o VQA.")
     with gr.Row():
         with gr.Column():
             image_input = gr.Image(type="filepath", label="Subir Imagen")
             model_dropdown = gr.Dropdown(choices=["IDEFICS2", "BLIP2"], label="Seleccionar Modelo", value="IDEFICS2")
             task_dropdown = gr.Dropdown(choices=["captioning", "vqa"], label="Seleccionar Tarea", value="captioning")
-            question_input = gr.Textbox(label="Pregunta VQA (opcional, solo para VQA)", placeholder="Ej: Are there people in the image?")
             submit_btn = gr.Button("Generar")
         with gr.Column():
             caption_output = gr.Textbox(label="Subtítulo Generado")
             vqa_output = gr.Textbox(label="Respuesta VQA")
             metrics_output = gr.Textbox(label="Métricas (Tiempo, VRAM, BLEU)")
     submit_btn.click(
-        fn=infer,
         inputs=[image_input, model_dropdown, task_dropdown, question_input],
-        outputs=[caption_output, gr.Number(label="Tiempo Captioning (s)"), vqa_output, gr.Number(label="Tiempo VQA (s)"), gr.Number(label="VRAM (GB)"), gr.Number(label="BLEU Score")]
     )
     gr.Markdown("### Notas")
     gr.Markdown("""
-    - para mejroar la velocidad de inferencia, descarga en local y usar GPU avanzada.
-    - La métrica BLEU usa una referencia genérica y puede no reflejar la calidad real.
     - Para más detalles, consulta el [repositorio del paper](https://huggingface.co/spaces/Pdro-ruiz/MLLM_Estado_del_Arte_Feb25/tree/main).
     """)

 import torch
 from transformers import (
     Idefics2Processor, Idefics2ForConditionalGeneration,
+    Blip2Processor, Blip2ForConditionalGeneration,
+    BitsAndBytesConfig
 )
 from PIL import Image
 import time
         "model_id": "HuggingFaceM4/idefics2-8b",
         "processor_class": Idefics2Processor,
         "model_class": Idefics2ForConditionalGeneration,
+        "caption_prompt": "<image>Describe la imagen con detalle"
     },
     "BLIP2": {
         "model_id": "Salesforce/blip2-opt-2.7b",
     }
 }
+# Cargar modelos con optimización (cuantización de 4 bits para IDEFICS2)
 model_instances = {}
 for model_name, config in models.items():
+    quantization_config = BitsAndBytesConfig(load_in_4bit=True) if "IDEFICS2" in model_name else None
     processor = config["processor_class"].from_pretrained(config["model_id"])
+    model = config["model_class"].from_pretrained(config["model_id"], quantization_config=quantization_config).to(device)
     model_instances[model_name] = (processor, model)
 # Preguntas VQA predefinidas
 vqa_questions = [
+    "¿Hay personas en la imagen?",
+    "¿Qué color predomina en la imagen?"
 ]
+# Referencia genérica para BLEU (puedes mejorar con captions reales de COCO)
+def load_coco_references(image_path):
+    # Return the list of reference captions used to score a generated caption with BLEU.
+    # Placeholder: `image_path` is currently ignored; the intended logic would map it
+    # to that image's COCO captions.
+    # For now, every image gets the same hard-coded generic reference.
+    return ["Una sala de estar con muebles y una chimenea"]  # Example
+# Lista para almacenar resultados
+results = []
+def infer_and_store(image, model_name, task, question=None):
     if image is None:
+        return "Por favor, sube una imagen.", None, None, None, None, None, "Por favor, sube una imagen."
     # Abrir y preparar la imagen
     image = Image.open(image).convert("RGB")
         caption = processor.decode(output_ids[0], skip_special_tokens=True)
         inference_time = time.time() - start_time
+        # Usar una referencia más significativa para BLEU
+        reference_caption = load_coco_references(image.name if hasattr(image, "name") else image)
+        bleu_score = sentence_bleu([ref.split() for ref in reference_caption], caption.split()) if caption else 0.0
+        # Almacenar resultados
+        results.append({
+            "Imagen": image.name if hasattr(image, "name") else "desconocida",
+            "Modelo": model_name,
+            "Tarea": task,
+            "Subtítulo": caption,
+            "Tiempo Captioning (s)": inference_time,
+            "Pregunta VQA": None,
+            "Respuesta VQA": None,
+            "Tiempo VQA (s)": None,
+            "VRAM (GB)": vram,
+            "Puntuación BLEU": bleu_score
+        })
+        return (caption, inference_time, None, None, vram, bleu_score, f"Modelo: {model_name}\nTarea: Captioning\nSubtítulo: {caption}\nTiempo: {inference_time:.3f} s\nVRAM: {vram:.3f} GB\nBLEU: {bleu_score:.3f}")
     elif task == "vqa" and question:
+        vqa_text = question if "BLIP2" in model_name else f"<image>Pregunta: {question}"
         inputs = processor(images=image, text=vqa_text, return_tensors="pt").to(device)
         output_ids = model.generate(
             **inputs,
         vqa_answer = processor.decode(output_ids[0], skip_special_tokens=True)
         inference_time = time.time() - start_time
+        # Almacenar resultados
+        results.append({
+            "Imagen": image.name if hasattr(image, "name") else "desconocida",
+            "Modelo": model_name,
+            "Tarea": task,
+            "Subtítulo": None,
+            "Tiempo Captioning (s)": None,
+            "Pregunta VQA": question,
+            "Respuesta VQA": vqa_answer,
+            "Tiempo VQA (s)": inference_time,
+            "VRAM (GB)": vram,
+            "Puntuación BLEU": None
+        })
+        return (None, None, vqa_answer, inference_time, vram, None, f"Modelo: {model_name}\nTarea: VQA\nPregunta: {question}\nRespuesta: {vqa_answer}\nTiempo: {inference_time:.3f} s\nVRAM: {vram:.3f} GB")
+    return "Selecciona una tarea válida y, para VQA, una pregunta de la lista.", None, None, None, None, None, "Selecciona una tarea válida y, para VQA, una pregunta de la lista."
 # Interfaz Gradio
+with gr.Blocks(title="Demostración de Benchmark para Modelos Multimodales (MLLMs)") as demo:
     gr.Markdown("# Benchmark para Modelos Multimodales (MLLMs)")
+    gr.Markdown("Sube una imagen, selecciona un modelo y una tarea, y obtén resultados de subtitulado o respuesta a preguntas visuales (VQA).")
     with gr.Row():
         with gr.Column():
             image_input = gr.Image(type="filepath", label="Subir Imagen")
             model_dropdown = gr.Dropdown(choices=["IDEFICS2", "BLIP2"], label="Seleccionar Modelo", value="IDEFICS2")
             task_dropdown = gr.Dropdown(choices=["captioning", "vqa"], label="Seleccionar Tarea", value="captioning")
+            question_input = gr.Dropdown(choices=vqa_questions, label="Pregunta VQA (selecciona una)", value=vqa_questions[0])
             submit_btn = gr.Button("Generar")
         with gr.Column():
             caption_output = gr.Textbox(label="Subtítulo Generado")
             vqa_output = gr.Textbox(label="Respuesta VQA")
             metrics_output = gr.Textbox(label="Métricas (Tiempo, VRAM, BLEU)")
+            results_output = gr.Textbox(label="Resumen de Resultados", lines=10)
     submit_btn.click(
+        fn=infer_and_store,
         inputs=[image_input, model_dropdown, task_dropdown, question_input],
+        outputs=[caption_output, gr.Number(label="Tiempo Captioning (s)"), vqa_output, gr.Number(label="Tiempo VQA (s)"), gr.Number(label="VRAM (GB)"), gr.Number(label="Puntuación BLEU"), results_output]
     )
     gr.Markdown("### Notas")
     gr.Markdown("""
+    - Para mejorar la velocidad de inferencia, descarga los modelos localmente y usa una GPU avanzada.
+    - La puntuación BLEU usa una referencia genérica y puede no reflejar la calidad real. Se recomienda mejorar las referencias con datos reales (e.g., COCO 2017).
     - Para más detalles, consulta el [repositorio del paper](https://huggingface.co/spaces/Pdro-ruiz/MLLM_Estado_del_Arte_Feb25/tree/main).
     """)