huckiyang committed
Commit 21e142e · Parent: 6f6fc25

[rank] adding rank

Files changed (2)
  1. app.py +56 -49
  2. leaderboard_data.csv +1 -1
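
In short, this change computes an overall ranking for the leaderboard: the four per-category normalized averages (the N-avg↑ columns under Alignment, Descriptiveness, Complexity, and Side effects) are averaged per model, and models are ranked on that mean in descending order, with ties sharing the same rank (method='min'). A condensed sketch of the added logic, assuming the updated two-row CSV header (see the leaderboard_data.csv diff below):

import pandas as pd

# Read the CSV with a two-row header so each column becomes a (category, metric) tuple.
df = pd.read_csv("leaderboard_data.csv", header=[0, 1])

n_avg_cols = [
    ("Alignment", "N-avg↑"),
    ("Descriptiveness", "N-avg↑"),
    ("Complexity", "N-avg↑"),
    ("Side effects", "N-avg↑"),
]

# Row-wise mean of the four N-avg↑ scores, then rank descending (rank 1 = best score).
df[("Overall", "Average N-avg")] = df[n_avg_cols].mean(axis=1)
df[("Overall", "Rank")] = (
    df[("Overall", "Average N-avg")].rank(method="min", ascending=False).astype(int)
)
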
app.py CHANGED
@@ -19,6 +19,51 @@ from src.display.css_html_js import custom_css
 # Load leaderboard data with multi-header, do not set index initially
 LEADERBOARD_DF_ORIGINAL = pd.read_csv("leaderboard_data.csv", header=[0, 1])
 
+# Calculate Average N-avg and Rank
+# Identify N-avg columns (adjust if names are different in CSV header row 2)
+n_avg_cols_to_average = [
+    ('Alignment', 'N-avg↑'),
+    ('Descriptiveness', 'N-avg↑'),
+    ('Complexity', 'N-avg↑'),
+    ('Side effects', 'N-avg↑')
+]
+
+# Ensure these columns are numeric, coercing errors to NaN (though they should be numbers)
+for col_tuple in n_avg_cols_to_average:
+    if col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
+        LEADERBOARD_DF_ORIGINAL[col_tuple] = pd.to_numeric(LEADERBOARD_DF_ORIGINAL[col_tuple], errors='coerce')
+    else:
+        print(f"Warning: N-avg column {col_tuple} not found for averaging.") # Add a warning
+
+# Calculate average, handling cases where some N-avg columns might be missing
+existing_n_avg_cols = [col for col in n_avg_cols_to_average if col in LEADERBOARD_DF_ORIGINAL.columns]
+if existing_n_avg_cols:
+    LEADERBOARD_DF_ORIGINAL[('Overall', 'Average N-avg')] = LEADERBOARD_DF_ORIGINAL[existing_n_avg_cols].mean(axis=1)
+    LEADERBOARD_DF_ORIGINAL[('Overall', 'Rank')] = LEADERBOARD_DF_ORIGINAL[('Overall', 'Average N-avg')].rank(method='min', ascending=False).astype(int)
+else:
+    LEADERBOARD_DF_ORIGINAL[('Overall', 'Average N-avg')] = np.nan
+    LEADERBOARD_DF_ORIGINAL[('Overall', 'Rank')] = np.nan
+
+
+# Reorder columns to put Rank and Average N-avg first, then Model, then the rest
+model_col_tuple = ('Model', 'Model') # Original name of the model column
+rank_col_tuple = ('Overall', 'Rank')
+avg_navg_col_tuple = ('Overall', 'Average N-avg')
+
+new_col_order = []
+if rank_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
+    new_col_order.append(rank_col_tuple)
+if avg_navg_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
+    new_col_order.append(avg_navg_col_tuple)
+if model_col_tuple in LEADERBOARD_DF_ORIGINAL.columns:
+    new_col_order.append(model_col_tuple)
+
+for col in LEADERBOARD_DF_ORIGINAL.columns:
+    if col not in new_col_order:
+        new_col_order.append(col)
+LEADERBOARD_DF_ORIGINAL = LEADERBOARD_DF_ORIGINAL[new_col_order]
+
+
 # Function to prepare DataFrame for display (format headers, ensure Model column)
 def format_leaderboard_df_for_display(df_orig):
     df_display = df_orig.copy()
@@ -40,22 +85,16 @@ BIAS_DF = BIAS_DF.astype(str).fillna("-")
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
-    gr.HTML("""<div style="text-align: center;">
-    <img src="https://huggingface.co/spaces/nvidia/LOTUS-VLM-Bias/blob/main/overview-acl-25.png" alt="Overview ACL 2025" style="width: 75%;">
-    </div>""")
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🧠 Unified performance evaluation of VLM captioners", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Column():
-                # gr.Image("table_snapshot.png", label="Original Table Snapshot", interactive=False) # Removed this line
-
                 table_output = gr.DataFrame(value=LEADERBOARD_DF_DISPLAY_INIT, label="Leaderboard Results", interactive=True, wrap=True)
 
                 gr.Markdown("---")
                 gr.Markdown("### Display Options")
 
-                # Filter choices from the original DataFrame's ('Model', 'Model') column
                 model_filter_choices = LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].unique().tolist()
                 model_selector = gr.CheckboxGroup(
                     choices=model_filter_choices,
@@ -66,8 +105,7 @@ with demo:
                 def update_table(selected_models_from_filter):
                     filtered_df_orig = LEADERBOARD_DF_ORIGINAL.copy()
                     if not selected_models_from_filter:
-                        # If no models selected, show an empty table structure (based on original for cols)
-                        filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin([])] # Empty but keeps structure
+                        filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin([])]
                     else:
                         valid_selected_models = [model for model in selected_models_from_filter if model in model_filter_choices]
                         if not valid_selected_models:
@@ -75,7 +113,6 @@ with demo:
                         else:
                             filtered_df_orig = LEADERBOARD_DF_ORIGINAL[LEADERBOARD_DF_ORIGINAL[('Model', 'Model')].isin(valid_selected_models)]
 
-                    # Format the filtered DataFrame for display
                     df_to_display = format_leaderboard_df_for_display(filtered_df_orig)
                     return gr.DataFrame.update(value=df_to_display)
 
@@ -87,54 +124,41 @@ with demo:
 
         with gr.TabItem("📝 Bias-aware evaluation of VLM ", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Column():
-                gr.Markdown("### Bias-Aware Evaluation Results") # Title for this section
+                gr.Markdown("### Bias-Aware Evaluation Results")
                 bias_table_output = gr.DataFrame(value=BIAS_DF, label="Bias Evaluation Results", interactive=True, wrap=True)
-
                 gr.Markdown("---")
                 gr.Markdown("### Display Options for Bias Table")
-
                 bias_all_columns_list = BIAS_DF.columns.tolist()
                 bias_column_selector = gr.CheckboxGroup(
                     choices=bias_all_columns_list,
                     value=bias_all_columns_list,
                     label="Select Columns to Display:"
                 )
-
-                # Filter by Bias_Type
                 bias_type_filter_choices = BIAS_DF["Bias_Type"].unique().tolist() if "Bias_Type" in BIAS_DF.columns else []
                 bias_type_selector = gr.CheckboxGroup(
                     choices=bias_type_filter_choices,
                     value=bias_type_filter_choices,
                     label="Filter by Bias Type:"
                 )
-
-                # Filter by Model (for the bias table)
                 bias_model_filter_choices = BIAS_DF["Model"].unique().tolist() if "Model" in BIAS_DF.columns else []
                 bias_model_selector_for_bias_tab = gr.CheckboxGroup(
                     choices=bias_model_filter_choices,
                     value=bias_model_filter_choices,
                     label="Filter by Model:"
                 )
-
                 def update_bias_table(selected_cols, selected_bias_types, selected_models):
                     temp_df = BIAS_DF.copy()
-
                     if selected_bias_types and "Bias_Type" in temp_df.columns:
                         temp_df = temp_df[temp_df["Bias_Type"].isin(selected_bias_types)]
-                    elif not selected_bias_types and "Bias_Type" in temp_df.columns: # No bias types selected
+                    elif not selected_bias_types and "Bias_Type" in temp_df.columns:
                         temp_df = pd.DataFrame(columns=BIAS_DF.columns)
-
                     if selected_models and "Model" in temp_df.columns:
                         temp_df = temp_df[temp_df["Model"].isin(selected_models)]
-                    elif not selected_models and "Model" in temp_df.columns: # No models selected
-                        # If already filtered by bias_type, maintain that, else show empty based on model filter
-                        if not selected_bias_types: # If bias types also not selected, then empty
+                    elif not selected_models and "Model" in temp_df.columns:
+                        if not selected_bias_types:
                             temp_df = pd.DataFrame(columns=BIAS_DF.columns)
-                        # if selected_bias_types IS populated, then it means we want all models for those bias types
-                        # but if selected_models is empty, it means filter to show NO models, hence the following line:
                         elif "Bias_Type" in temp_df.columns and temp_df["Bias_Type"].isin(selected_bias_types).any():
-                            temp_df = temp_df[~temp_df["Model"].isin(BIAS_DF["Model"].unique())] # effectively show no models for selected bias types
-
+                            temp_df = temp_df[~temp_df["Model"].isin(BIAS_DF["Model"].unique())]
                     valid_selected_cols = [col for col in selected_cols if col in temp_df.columns]
                     if not valid_selected_cols and not temp_df.empty:
                         final_df = temp_df
@@ -142,53 +166,37 @@ with demo:
                         final_df = pd.DataFrame(columns=selected_cols)
                     else:
                         final_df = temp_df[valid_selected_cols]
-
                     return gr.DataFrame.update(value=final_df)
-
                 bias_column_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
                 bias_type_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
                 bias_model_selector_for_bias_tab.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector_for_bias_tab], outputs=[bias_table_output])
-
-                # The original gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") is replaced by the table and its controls.
-                # If you still want to show LLM_BENCHMARKS_TEXT, you can add it here, e.g.:
-                # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🧑‍🍳 User Type and Preference-Oriented Scores ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 gr.Markdown("### Preference-Oriented Scores by User Type and Model")
-
                 def create_preference_score_chart():
-                    # User types and model names
                     user_types = ['Detail-oriented', 'Risk-conscious', 'Accuracy-focused']
                     models = ['MiniGPT-4', 'InstructBLIP', 'LLaVA-1.5', 'mPLUG-Owl2', 'Qwen2-VL']
-
-                    # Data
                    scores = np.array([
                         [0.20, 0.35, 0.45, 0.50, 0.85], # Detail-oriented
                         [0.40, 0.55, 0.67, 0.53, 0.58], # Risk-conscious
                         [0.20, 0.60, 0.72, 0.69, 0.75] # Accuracy-focused
                     ])
-
                     x = np.arange(len(user_types))
                     width = 0.15
-
-                    fig, ax = plt.subplots(figsize=(12, 7)) # Increased figure size for better readability
-
+                    fig, ax = plt.subplots(figsize=(12, 7))
                     for i, model in enumerate(models):
-                        ax.bar(x + i * width - (width * (len(models)-1)/2), scores[:, i], width, label=model) # Centered bars
-
+                        ax.bar(x + i * width - (width * (len(models)-1)/2), scores[:, i], width, label=model)
                     ax.set_xlabel('User type', fontsize=12)
                     ax.set_ylabel('Preference-oriented score', fontsize=12)
                     ax.set_title('Preference-oriented scores by User Type and Model', fontsize=14)
                     ax.set_xticks(x)
                     ax.set_xticklabels(user_types, fontsize=10)
-                    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left') # Legend outside plot
-
+                    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
                     plt.ylim(0, 1.1)
                     plt.grid(axis='y', linestyle='--', alpha=0.7)
-                    plt.tight_layout(rect=[0, 0, 0.85, 1]) # Adjust layout to make space for legend
+                    plt.tight_layout(rect=[0, 0, 0.85, 1])
                     return fig
-
                 gr.Plot(value=create_preference_score_chart)
 
     with gr.Row():
@@ -201,8 +209,7 @@ with demo:
                 show_copy_button=True,
             )
 
-    # Add a button to link to the Hugging Face discussion page
-    gr.Markdown("---") # Visual separator
+    gr.Markdown("---")
     link_to_discussion = "https://huggingface.co/login?next=%2Fspaces%2Fnvidia%2FLOTUS-VLM-Bias%2Fdiscussions%2Fnew"
     gr.HTML(f'''
     <div style="text-align: center; margin-top: 20px; margin-bottom: 20px;">
leaderboard_data.csv CHANGED
@@ -1,5 +1,5 @@
 Model,Alignment,Alignment,Alignment,Alignment,Descriptiveness,Descriptiveness,Descriptiveness,Descriptiveness,Complexity,Complexity,Complexity,Side effects,Side effects,Side effects,Side effects,Side effects
-Model,CLIP-S,CapS_S,CapS_A,N-avg,Recall,Noun,Verb,N-avg,Syn,Sem,N-avg,CHs↓,FS↑,FSs↑,Harm↓,N-avg↑
+Model,CLIP-S,CapS_S,CapS_A,N-avg↑,Recall,Noun,Verb,N-avg↑,Syn,Sem,N-avg↑,CHs↓,FS↑,FSs↑,Harm↓,N-avg↑
 MiniGPT-4,60.8,33.0,35.9,0.19,75.3,33.0,34.7,0.22,8.0,32.6,0.38,37.8,55.0,37.6,0.31,0.18
 InstructBLIP,59.9,36.0,35.5,0.18,82.1,34.2,34.7,0.40,7.7,46.0,0.41,58.5,62.4,43.3,0.10,0.66
 LLaVA-1.5,60.1,38.5,45.0,0.67,80.5,32.5,31.0,0.11,7.1,39.6,0.08,49.0,65.7,41.6,0.12,0.71
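
The single-line CSV change adds the ↑ marker to the remaining per-category N-avg headers, so that loading the file with header=[0, 1] yields exactly the (category, 'N-avg↑') column tuples the new averaging code looks up (otherwise it would print its missing-column warning and skip them). A quick way to check, assuming the updated file is on disk:

import pandas as pd

df = pd.read_csv("leaderboard_data.csv", header=[0, 1])
# Expect the four tuples used for the overall average:
# ('Alignment', 'N-avg↑'), ('Descriptiveness', 'N-avg↑'),
# ('Complexity', 'N-avg↑'), ('Side effects', 'N-avg↑')
print([col for col in df.columns if col[1] == "N-avg↑"])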