Pdro-ruiz committed
Commit fc4df88 · verified · 1 Parent(s): 9b34322

Update app.py

Files changed (1)
  1. app.py +134 -135
app.py CHANGED
@@ -1,136 +1,135 @@
- import gradio as gr
- import torch
- from transformers import (
-     Idefics2Processor, Idefics2ForConditionalGeneration,
-     Blip2Processor, Blip2ForConditionalGeneration
- )
- from PIL import Image
- import time
- import pandas as pd
- import nltk
- from nltk.translate.bleu_score import sentence_bleu
-
- # Download 'punkt' if it is not available
- try:
-     nltk.data.find("tokenizers/punkt")
- except LookupError:
-     nltk.download("punkt")
-
- # Device configuration
- device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Using device: {device}")
-
- # Model definitions
- models = {
-     "IDEFICS2": {
-         "model_id": "HuggingFaceM4/idefics2-8b",
-         "processor_class": Idefics2Processor,
-         "model_class": Idefics2ForConditionalGeneration,
-         "caption_prompt": "<image>Describe the image in detail"
-     },
-     "BLIP2": {
-         "model_id": "Salesforce/blip2-opt-2.7b",
-         "processor_class": Blip2Processor,
-         "model_class": Blip2ForConditionalGeneration,
-         "caption_prompt": ""  # Empty prompt for BLIP2
-     }
- }
-
- # Load models (pre-loaded to avoid delays)
- model_instances = {}
- for model_name, config in models.items():
-     processor = config["processor_class"].from_pretrained(config["model_id"])
-     model = config["model_class"].from_pretrained(config["model_id"]).to(device)
-     model_instances[model_name] = (processor, model)
-
- # Predefined VQA questions
- vqa_questions = [
-     "Are there people in the image?",
-     "Which color predominates in the image?"
- ]
-
- # Generic reference for BLEU (adjust as needed)
- reference_caption = ["An image with people and various objects"]
-
- def infer(image, model_name, task, question=None):
-     if image is None:
-         return "Please upload an image.", None, None, None, None, None
-
-     # Open and prepare the image
-     image = Image.open(image).convert("RGB")
-     if "BLIP2" in model_name:
-         image = image.resize((224, 224))
-
-     processor, model = model_instances[model_name]
-
-     start_time = time.time()
-     vram = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
-
-     if task == "captioning":
-         caption_prompt = models[model_name]["caption_prompt"]
-         caption_text = "" if "BLIP2" in model_name else caption_prompt
-         inputs = processor(images=image, text=caption_text, return_tensors="pt").to(device)
-         output_ids = model.generate(
-             **inputs,
-             max_new_tokens=50,
-             num_beams=5 if "BLIP2" in model_name else 1,
-             no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
-         )
-         caption = processor.decode(output_ids[0], skip_special_tokens=True)
-         inference_time = time.time() - start_time
-
-         # Compute BLEU (simplified, using a generic reference)
-         bleu_score = sentence_bleu([reference_caption[0].split()], caption.split()) if caption else 0.0
-
-         return (caption, inference_time, None, None, vram, bleu_score)
-
-     elif task == "vqa" and question:
-         vqa_text = question if "BLIP2" in model_name else f"<image>Q: {question}"
-         inputs = processor(images=image, text=vqa_text, return_tensors="pt").to(device)
-         output_ids = model.generate(
-             **inputs,
-             max_new_tokens=10,
-             num_beams=5 if "BLIP2" in model_name else 1,
-             no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
-         )
-         vqa_answer = processor.decode(output_ids[0], skip_special_tokens=True)
-         inference_time = time.time() - start_time
-
-         return (None, None, vqa_answer, inference_time, vram, None)
-
-     return "Select a valid task and, for VQA, a question.", None, None, None, None, None
-
- # Gradio interface
- with gr.Blocks(title="MLLM Benchmark Demo") as demo:
-     gr.Markdown("# Benchmark Demo for Multimodal Models (MLLMs)")
-     gr.Markdown("Upload an image, select a model and a task, and get captioning or VQA results.")
-
-     with gr.Row():
-         with gr.Column():
-             image_input = gr.Image(type="filepath", label="Upload Image")
-             model_dropdown = gr.Dropdown(choices=["IDEFICS2", "BLIP2"], label="Select Model", value="IDEFICS2")
-             task_dropdown = gr.Dropdown(choices=["captioning", "vqa"], label="Select Task", value="captioning")
-             question_input = gr.Textbox(label="VQA Question (optional, VQA only)", placeholder="E.g.: Are there people in the image?")
-             submit_btn = gr.Button("Generate")
-
-         with gr.Column():
-             caption_output = gr.Textbox(label="Generated Caption")
-             vqa_output = gr.Textbox(label="VQA Answer")
-             metrics_output = gr.Textbox(label="Metrics (Time, VRAM, BLEU)")
-
-     submit_btn.click(
-         fn=infer,
-         inputs=[image_input, model_dropdown, task_dropdown, question_input],
-         outputs=[caption_output, gr.Number(label="Captioning Time (s)"), vqa_output, gr.Number(label="VQA Time (s)"), gr.Number(label="VRAM (GB)"), gr.Number(label="BLEU Score")]
-     )
-
-     gr.Markdown("### Notes")
-     gr.Markdown("""
-     - This demo uses a subset of COCO 2017 images. Download images from [cocodataset.org](http://cocodataset.org/#download) if you want to use your own images.
-     - A GPU is required for best results, but the demo also runs on CPU with longer inference times.
-     - The BLEU metric uses a generic reference and may not reflect actual quality.
-     - For more details, see the [paper repository](https://huggingface.co/datasets/pdro-ruiz/multimodal-llms-paper-2025).
-     """)
-
- if __name__ == "__main__":
+ import gradio as gr
+ import torch
+ from transformers import (
+     Idefics2Processor, Idefics2ForConditionalGeneration,
+     Blip2Processor, Blip2ForConditionalGeneration
+ )
+ from PIL import Image
+ import time
+ import pandas as pd
+ import nltk
+ from nltk.translate.bleu_score import sentence_bleu
+
+ # Download 'punkt' if it is not available
+ try:
+     nltk.data.find("tokenizers/punkt")
+ except LookupError:
+     nltk.download("punkt")
+
+ # Device configuration
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # Model definitions
+ models = {
+     "IDEFICS2": {
+         "model_id": "HuggingFaceM4/idefics2-8b",
+         "processor_class": Idefics2Processor,
+         "model_class": Idefics2ForConditionalGeneration,
+         "caption_prompt": "<image>Describe the image in detail"
+     },
+     "BLIP2": {
+         "model_id": "Salesforce/blip2-opt-2.7b",
+         "processor_class": Blip2Processor,
+         "model_class": Blip2ForConditionalGeneration,
+         "caption_prompt": ""  # Empty prompt for BLIP2
+     }
+ }
+
+ # Load models (pre-loaded to avoid delays)
+ model_instances = {}
+ for model_name, config in models.items():
+     processor = config["processor_class"].from_pretrained(config["model_id"])
+     model = config["model_class"].from_pretrained(config["model_id"]).to(device)
+     model_instances[model_name] = (processor, model)
+
+ # Predefined VQA questions
+ vqa_questions = [
+     "Are there people in the image?",
+     "Which color predominates in the image?"
+ ]
+
+ # Generic reference for BLEU (adjust as needed)
+ reference_caption = ["An image with people and various objects"]
+
+ def infer(image, model_name, task, question=None):
+     if image is None:
+         return "Please upload an image.", None, None, None, None, None
+
+     # Open and prepare the image
+     image = Image.open(image).convert("RGB")
+     if "BLIP2" in model_name:
+         image = image.resize((224, 224))
+
+     processor, model = model_instances[model_name]
+
+     start_time = time.time()
+     vram = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
+
+     if task == "captioning":
+         caption_prompt = models[model_name]["caption_prompt"]
+         caption_text = "" if "BLIP2" in model_name else caption_prompt
+         inputs = processor(images=image, text=caption_text, return_tensors="pt").to(device)
+         output_ids = model.generate(
+             **inputs,
+             max_new_tokens=50,
+             num_beams=5 if "BLIP2" in model_name else 1,
+             no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
+         )
+         caption = processor.decode(output_ids[0], skip_special_tokens=True)
+         inference_time = time.time() - start_time
+
+         # Compute BLEU (simplified, using a generic reference)
+         bleu_score = sentence_bleu([reference_caption[0].split()], caption.split()) if caption else 0.0
+
+         return (caption, inference_time, None, None, vram, bleu_score)
+
+     elif task == "vqa" and question:
+         vqa_text = question if "BLIP2" in model_name else f"<image>Q: {question}"
+         inputs = processor(images=image, text=vqa_text, return_tensors="pt").to(device)
+         output_ids = model.generate(
+             **inputs,
+             max_new_tokens=10,
+             num_beams=5 if "BLIP2" in model_name else 1,
+             no_repeat_ngram_size=2 if "BLIP2" in model_name else 0
+         )
+         vqa_answer = processor.decode(output_ids[0], skip_special_tokens=True)
+         inference_time = time.time() - start_time
+
+         return (None, None, vqa_answer, inference_time, vram, None)
+
+     return "Select a valid task and, for VQA, a question.", None, None, None, None, None
+
+ # Gradio interface
+ with gr.Blocks(title="MLLM Benchmark Demo") as demo:
+     gr.Markdown("# Benchmark for Multimodal Models (MLLMs)")
+     gr.Markdown("Upload an image, select a model and a task, and get captioning or VQA results.")
+
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(type="filepath", label="Upload Image")
+             model_dropdown = gr.Dropdown(choices=["IDEFICS2", "BLIP2"], label="Select Model", value="IDEFICS2")
+             task_dropdown = gr.Dropdown(choices=["captioning", "vqa"], label="Select Task", value="captioning")
+             question_input = gr.Textbox(label="VQA Question (optional, VQA only)", placeholder="E.g.: Are there people in the image?")
+             submit_btn = gr.Button("Generate")
+
+         with gr.Column():
+             caption_output = gr.Textbox(label="Generated Caption")
+             vqa_output = gr.Textbox(label="VQA Answer")
+             metrics_output = gr.Textbox(label="Metrics (Time, VRAM, BLEU)")
+
+     submit_btn.click(
+         fn=infer,
+         inputs=[image_input, model_dropdown, task_dropdown, question_input],
+         outputs=[caption_output, gr.Number(label="Captioning Time (s)"), vqa_output, gr.Number(label="VQA Time (s)"), gr.Number(label="VRAM (GB)"), gr.Number(label="BLEU Score")]
+     )
+
+     gr.Markdown("### Notes")
+     gr.Markdown("""
+     - To improve inference speed, download the demo to run locally and use a high-end GPU.
+     - The BLEU metric uses a generic reference and may not reflect actual quality.
+     - For more details, see the [paper repository](https://huggingface.co/spaces/Pdro-ruiz/MLLM_Estado_del_Arte_Feb25/tree/main).
+     """)
+
+ if __name__ == "__main__":
      demo.launch()
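
The speed note points at running the demo locally on a stronger GPU. One further option the app does not use is half-precision loading. The sketch below shows what that could look like for the same BLIP-2 checkpoint, assuming a CUDA device and recent `transformers`/`torch`; the local image path is a placeholder.

```python
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda"  # assumes a CUDA-capable GPU is available

# Load the processor normally and the model in float16 to roughly halve VRAM use
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
).to(device)

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
# Cast floating-point inputs (pixel values) to float16 to match the model
inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)

output_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output_ids[0], skip_special_tokens=True))
```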
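On the BLEU caveat: `sentence_bleu` scored against a single generic reference and without smoothing tends toward zero for short captions, because any missing higher-order n-gram zeroes the geometric mean. A minimal sketch of a smoothed variant using NLTK's `SmoothingFunction`; the example caption is made up, and the app itself does not apply smoothing.

```python
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference_caption = ["An image with people and various objects"]
caption = "a group of people standing around a table"  # hypothetical model output

# method1 adds a small count to zero n-gram precisions so that short captions
# do not collapse the BLEU geometric mean to 0.0
smoother = SmoothingFunction().method1
bleu_score = sentence_bleu(
    [reference_caption[0].split()],
    caption.split(),
    smoothing_function=smoother,
)
print(f"Smoothed BLEU: {bleu_score:.4f}")
```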