huckiyang committed on
Commit
35d0ee5
·
1 Parent(s): cf40e88

[data] add user types

Files changed (3)
  1. .DS_Store +0 -0
  2. app.py +41 -1
  3. src/about.py +1 -1
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -1,5 +1,7 @@
 import gradio as gr
 import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -28,7 +30,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Unified performance evaluation of VLM captioners", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🧠 Unified performance evaluation of VLM captioners", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Column():
                 # 1. Display the table first
                 # Make DataFrame interactive for sorting
@@ -163,6 +165,44 @@ with demo:
                 # If you still want to show LLM_BENCHMARKS_TEXT, you can add it here, e.g.:
                 # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
+        with gr.TabItem("🧑‍🍳 User Type and Preference-Oriented Scores ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
+                gr.Markdown("### Preference-Oriented Scores by User Type and Model")
+
+                def create_preference_score_chart():
+                    # User types and model names
+                    user_types = ['Detail-oriented', 'Risk-conscious', 'Accuracy-focused']
+                    models = ['MiniGPT-4', 'InstructBLIP', 'LLaVA-1.5', 'mPLUG-Owl2', 'Qwen2-VL']
+
+                    # Data
+                    scores = np.array([
+                        [0.20, 0.35, 0.45, 0.50, 0.85],  # Detail-oriented
+                        [0.40, 0.55, 0.60, 0.60, 0.58],  # Risk-conscious
+                        [0.20, 0.65, 0.90, 0.70, 0.75]   # Accuracy-focused
+                    ])
+
+                    x = np.arange(len(user_types))
+                    width = 0.15
+
+                    fig, ax = plt.subplots(figsize=(12, 7))  # Increased figure size for better readability
+
+                    for i, model in enumerate(models):
+                        ax.bar(x + i * width - (width * (len(models) - 1) / 2), scores[:, i], width, label=model)  # Centered bars
+
+                    ax.set_xlabel('User type', fontsize=12)
+                    ax.set_ylabel('Preference-oriented score', fontsize=12)
+                    ax.set_title('Preference-oriented scores by User Type and Model', fontsize=14)
+                    ax.set_xticks(x)
+                    ax.set_xticklabels(user_types, fontsize=10)
+                    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')  # Legend outside plot
+
+                    plt.ylim(0, 1.1)
+                    plt.grid(axis='y', linestyle='--', alpha=0.7)
+                    plt.tight_layout(rect=[0, 0, 0.85, 1])  # Adjust layout to make space for legend
+                    return fig
+
+                gr.Plot(value=create_preference_score_chart)
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
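Two notes on the added chart code. Passing the function itself (rather than its return value) to gr.Plot(value=...) defers figure creation until the app loads, and the offset term x + i * width - (width * (len(models) - 1) / 2) spreads the five model bars symmetrically around each user-type tick. A minimal, purely illustrative sketch of that centering arithmetic, reusing only the committed names models and width:

models = ['MiniGPT-4', 'InstructBLIP', 'LLaVA-1.5', 'mPLUG-Owl2', 'Qwen2-VL']
width = 0.15

# Offset of each model's bar from its group's x tick; the offsets are
# symmetric around 0, so the group of bars stays centered on the label.
offsets = [i * width - width * (len(models) - 1) / 2 for i in range(len(models))]
print([round(o, 2) for o in offsets])  # [-0.3, -0.15, 0.0, 0.15, 0.3]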
src/about.py CHANGED
@@ -25,7 +25,7 @@ TITLE = """<h1 align="center" id="space-title">🪷 LOTUS: A Leaderboard for Det
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+(ACL 2025 Industry Track) Large Vision-Language Models (LVLMs) have transformed image captioning, shifting from concise captions to detailed, context-rich descriptions. We introduce LOTUS, a unified leaderboard for evaluating such detailed captions, addressing three main gaps in existing evaluation approaches: lack of standardized criteria, absence of bias-aware assessments, and evaluations that disregard user preferences.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?