huckiyang committed
Commit 21e142e · Parent: 6f6fc25

[rank] adding rank

Files changed (2)
  1. app.py +56 -49
  2. leaderboard_data.csv +1 -1
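
In short, this change computes an overall ranking for the leaderboard: the four per-category normalized averages (the N-avg↑ columns under Alignment, Descriptiveness, Complexity, and Side effects) are averaged per model, and models are ranked on that mean in descending order, with ties sharing the same rank (method='min'). A condensed sketch of the added logic, assuming the updated two-row CSV header (see the leaderboard_data.csv diff below):

import pandas as pd

# Read the CSV with a two-row header so each column becomes a (category, metric) tuple.
df = pd.read_csv("leaderboard_data.csv", header=[0, 1])

n_avg_cols = [
    ("Alignment", "N-avg↑"),
    ("Descriptiveness", "N-avg↑"),
    ("Complexity", "N-avg↑"),
    ("Side effects", "N-avg↑"),
]

# Row-wise mean of the four N-avg↑ scores, then rank descending (rank 1 = best score).
df[("Overall", "Average N-avg")] = df[n_avg_cols].mean(axis=1)
df[("Overall", "Rank")] = (
    df[("Overall", "Average N-avg")].rank(method="min", ascending=False).astype(int)
)
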
app.py CHANGED
@@ -19,6 +19,51 @@ from src.display.css_html_js import custom_css
 # Load leaderboard data with multi-header, do not set index initially
 LEADERBOARD_DF_ORIGINAL = pd.read_csv("leaderboard_data.csv", header=[0, 1])
 
+# Calculate Average N-avg and Rank
+# Identify N-avg columns (adjust if names are different in CSV header row 2)
+n_avg_cols_to_average = [
+    ('Alignment', 'N-avg↑'),
+    ('Descriptiveness', 'N-avg↑'),
+    ('Complexity', 'N-avg↑'),
+    ('Side effects', 'N-avg↑')
+]
+
+# Ensure these columns are numeric, coercing errors to NaN (though they should be numbers)
+for col_tuple in n_avg_cols_to_average:
+    if col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
+        LEADERBOARD_DF_ORIGINAL[col_tuple] = pd.to_numeric(LEADERBOARD_DF_ORIGINAL[col_tuple], errors='coerce')
+    else:
+        print(f"Warning: N-avg column {col_tuple} not found for averaging.") # Add a warning
+
+# Calculate average, handling cases where some N-avg columns might be missing
+existing_n_avg_cols = [col for col in n_avg_cols_to_average if col in LEADERBOARD_DF_ORIGINAL.columns]
+if existing_n_avg_cols:
+    LEADERBOARD_DF_ORIGINAL[('Overall', 'Average N-avg')] = LEADERBOARD_DF_ORIGINAL[existing_n_avg_cols].mean(axis=1)
+    LEADERBOARD_DF_ORIGINAL[('Overall', 'Rank')] = LEADERBOARD_DF_ORIGINAL[('Overall', 'Average N-avg')].rank(method='min', ascending=False).astype(int)
+else:
+    LEADERBOARD_DF_ORIGINAL[('Overall', 'Average N-avg')] = np.nan
+    LEADERBOARD_DF_ORIGINAL[('Overall', 'Rank')] = np.nan
+
+
+# Reorder columns to put Rank and Average N-avg first, then Model, then the rest
+model_col_tuple = ('Model', 'Model') # Original name of the model column
+rank_col_tuple = ('Overall', 'Rank')
+avg_navg_col_tuple = ('Overall', 'Average N-avg')
+
+new_col_order = []
+if rank_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
+    new_col_order.append(rank_col_tuple)
+if avg_navg_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
+    new_col_order.append(avg_navg_col_tuple)
+if model_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
+    new_col_order.append(model_col_tuple)
+
+for col in LEADERBOARD_DF_ORIGINAL.columns:
+    if col not in new_col_order:
+        new_col_order.append(col)
+LEADERBOARD_DF_ORIGINAL = LEADERBOARD_DF_ORIGINAL[new_col_order]
+
+
 # Function to prepare DataFrame for display (format headers, ensure Model column)
 def format_leaderboard_df_for_display(df_orig):
     df_display = df_orig.copy()
@@ -40,22 +85,16 @@ BIAS_DF = BIAS_DF.astype(str).fillna("-")
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
-    gr.HTML("""<div style="text-align: center;">
-    <img src="https://huggingface.co/spaces/nvidia/LOTUS-VLM-Bias/blob/main/overview-acl-25.png" alt="Overview ACL 2025" style="width: 75%;">
-    </div>""")
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🧠 Unified performance evaluation of VLM captioners", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Column():
-                # gr.Image("table_snapshot.png", label="Original Table Snapshot", interactive=False) # Removed this line
-
                 table_output = gr.DataFrame(value=LEADERBOARD_DF_DISPLAY_INIT, label="Leaderboard Results", interactive=True, wrap=True)
 
                 gr.Markdown("---")
                 gr.Markdown("### Display Options")
 
-                # Filter choices from the original DataFrame's ('Model', 'Model') column
                 model_filter_choices = LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].unique().tolist()
                 model_selector = gr.CheckboxGroup(
                     choices=model_filter_choices,
@@ -66,8 +105,7 @@ with demo:
                 def update_table(selected_models_from_filter):
                     filtered_df_orig = LEADERBOARD_DF_ORIGINAL.copy()
                     if not selected_models_from_filter:
-                        # If no models selected, show an empty table structure (based on original for cols)
-                        filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin([])] # Empty but keeps structure
+                        filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin([])]
                     else:
                         valid_selected_models = [model for model in selected_models_from_filter if model in model_filter_choices]
                         if not valid_selected_models:
@@ -75,7 +113,6 @@ with demo:
                         else:
                             filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin(valid_selected_models)]
 
-                    # Format the filtered DataFrame for display
                     df_to_display = format_leaderboard_df_for_display(filtered_df_orig)
                     return gr.DataFrame.update(value=df_to_display)
 
@@ -87,54 +124,41 @@ with demo:
 
         with gr.TabItem("📝 Bias-aware evaluation of VLM ", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Column():
-                gr.Markdown("### Bias-Aware Evaluation Results") # Title for this section
+                gr.Markdown("### Bias-Aware Evaluation Results")
                 bias_table_output = gr.DataFrame(value=BIAS_DF, label="Bias Evaluation Results", interactive=True, wrap=True)
-
                 gr.Markdown("---")
                 gr.Markdown("### Display Options for Bias Table")
-
                 bias_all_columns_list = BIAS_DF.columns.tolist()
                 bias_column_selector = gr.CheckboxGroup(
                     choices=bias_all_columns_list,
                     value=bias_all_columns_list,
                     label="Select Columns to Display:"
                 )
-
-                # Filter by Bias_Type
                 bias_type_filter_choices = BIAS_DF["Bias_Type"].unique().tolist() if "Bias_Type" in BIAS_DF.columns else []
                 bias_type_selector = gr.CheckboxGroup(
                     choices=bias_type_filter_choices,
                     value=bias_type_filter_choices,
                     label="Filter by Bias Type:"
                 )
-
-                # Filter by Model (for the bias table)
                 bias_model_filter_choices = BIAS_DF["Model"].unique().tolist() if "Model" in BIAS_DF.columns else []
                 bias_model_selector_for_bias_tab = gr.CheckboxGroup(
                     choices=bias_model_filter_choices,
                     value=bias_model_filter_choices,
                     label="Filter by Model:"
                 )
-
                 def update_bias_table(selected_cols, selected_bias_types, selected_models):
                     temp_df = BIAS_DF.copy()
-
                     if selected_bias_types and "Bias_Type" in temp_df.columns:
                         temp_df = temp_df[temp_df["Bias_Type"].isin(selected_bias_types)]
-                    elif not selected_bias_types and "Bias_Type" in temp_df.columns: # No bias types selected
+                    elif not selected_bias_types and "Bias_Type" in temp_df.columns:
                         temp_df = pd.DataFrame(columns=BIAS_DF.columns)
-
                     if selected_models and "Model" in temp_df.columns:
                         temp_df = temp_df[temp_df["Model"].isin(selected_models)]
-                    elif not selected_models and "Model" in temp_df.columns: # No models selected
-                        # If already filtered by bias_type, maintain that, else show empty based on model filter
-                        if not selected_bias_types: # If bias types also not selected, then empty
+                    elif not selected_models and "Model" in temp_df.columns:
+                        if not selected_bias_types:
                             temp_df = pd.DataFrame(columns=BIAS_DF.columns)
-                        # if selected_bias_types IS populated, then it means we want all models for those bias types
-                        # but if selected_models is empty, it means filter to show NO models, hence the following line:
                         elif "Bias_Type" in temp_df.columns and temp_df["Bias_Type"].isin(selected_bias_types).any():
-                            temp_df = temp_df[~temp_df["Model"].isin(BIAS_DF["Model"].unique())] # effectively show no models for selected bias types
-
+                            temp_df = temp_df[~temp_df["Model"].isin(BIAS_DF["Model"].unique())]
                     valid_selected_cols = [col for col in selected_cols if col in temp_df.columns]
                     if not valid_selected_cols and not temp_df.empty:
                         final_df = temp_df
@@ -142,53 +166,37 @@ with demo:
                         final_df = pd.DataFrame(columns=selected_cols)
                     else:
                         final_df = temp_df[valid_selected_cols]
-
                     return gr.DataFrame.update(value=final_df)
-
                 bias_column_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
                 bias_type_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
                 bias_model_selector_for_bias_tab.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
-
-                # The original gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") is replaced by the table and its controls.
-                # If you still want to show LLM_BENCHMARKS_TEXT, you can add it here, e.g.:
-                # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🧑‍🍳 User Type and Preference-Oriented Scores ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 gr.Markdown("### Preference-Oriented Scores by User Type and Model")
-
                 def create_preference_score_chart():
-                    # User types and model names
                     user_types = ['Detail-oriented', 'Risk-conscious', 'Accuracy-focused']
                     models = ['MiniGPT-4', 'InstructBLIP', 'LLaVA-1.5', 'mPLUG-Owl2', 'Qwen2-VL']
-
-                    # Data
                    scores = np.array([
                         [0.20, 0.35, 0.45, 0.50, 0.85], # Detail-oriented
                         [0.40, 0.55, 0.67, 0.53, 0.58], # Risk-conscious
                         [0.20, 0.60, 0.72, 0.69, 0.75] # Accuracy-focused
                     ])
-
                     x = np.arange(len(user_types))
                     width = 0.15
-
-                    fig, ax = plt.subplots(figsize=(12, 7)) # Increased figure size for better readability
-
+                    fig, ax = plt.subplots(figsize=(12, 7))
                     for i, model in enumerate(models):
-                        ax.bar(x + i * width - (width * (len(models)-1)/2), scores[:, i], width, label=model) # Centered bars
-
+                        ax.bar(x + i * width - (width * (len(models)-1)/2), scores[:, i], width, label=model)
                     ax.set_xlabel('User type', fontsize=12)
                     ax.set_ylabel('Preference-oriented score', fontsize=12)
                     ax.set_title('Preference-oriented scores by User Type and Model', fontsize=14)
                     ax.set_xticks(x)
                     ax.set_xticklabels(user_types, fontsize=10)
-                    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left') # Legend outside plot
-
+                    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
                     plt.ylim(0, 1.1)
                     plt.grid(axis='y', linestyle='--', alpha=0.7)
-                    plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout to make space for legend
+                    plt.tight_layout(rect=[0, 0, 0.85, 1])
                     return fig
-
                 gr.Plot(value=create_preference_score_chart)
 
     with gr.Row():
@@ -201,8 +209,7 @@ with demo:
                 show_copy_button=True,
             )
 
-    # Add a button to link to the Hugging Face discussion page
-    gr.Markdown("---") # Visual separator
+    gr.Markdown("---")
     link_to_discussion = "https://huggingface.co/login?next=%2Fspaces%2Fnvidia%2FLOTUS-VLM-Bias%2Fdiscussions%2Fnew"
     gr.HTML(f'''
     <div style="text-align: center; margin-top: 20px; margin-bottom: 20px;">
leaderboard_data.csv CHANGED
@@ -1,5 +1,5 @@
 Model,Alignment,Alignment,Alignment,Alignment,Descriptiveness,Descriptiveness,Descriptiveness,Descriptiveness,Complexity,Complexity,Complexity,Side effects,Side effects,Side effects,Side effects,Side effects
-Model,CLIP-S,CapS_S,CapS_A,N-avg,Recall,Noun,Verb,N-avg,Syn,Sem,N-avg,CHs↓,FS↑,FSs↑,Harm↓,N-avg↑
+Model,CLIP-S,CapS_S,CapS_A,N-avg↑,Recall,Noun,Verb,N-avg↑,Syn,Sem,N-avg↑,CHs↓,FS↑,FSs↑,Harm↓,N-avg↑
 MiniGPT-4,60.8,33.0,35.9,0.19,75.3,33.0,34.7,0.22,8.0,32.6,0.38,37.8,55.0,37.6,0.31,0.18
 InstructBLIP,59.9,36.0,35.5,0.18,82.1,34.2,34.7,0.40,7.7,46.0,0.41,58.5,62.4,43.3,0.10,0.66
 LLaVA-1.5,60.1,38.5,45.0,0.67,80.5,32.5,31.0,0.11,7.1,39.6,0.08,49.0,65.7,41.6,0.12,0.71
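
The single-line CSV change adds the ↑ marker to the remaining per-category N-avg headers, so that loading the file with header=[0, 1] yields exactly the (category, 'N-avg↑') column tuples the new averaging code looks up (otherwise it would print its missing-column warning and skip them). A quick way to check, assuming the updated file is on disk:

import pandas as pd

df = pd.read_csv("leaderboard_data.csv", header=[0, 1])
# Expect the four tuples used for the overall average:
# ('Alignment', 'N-avg↑'), ('Descriptiveness', 'N-avg↑'),
# ('Complexity', 'N-avg↑'), ('Side effects', 'N-avg↑')
print([col for col in df.columns if col[1] == "N-avg↑"])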