[update] table 2
- app.py +69 -1
- bias_evaluation_data.csv +16 -0
app.py
CHANGED
@@ -18,6 +18,9 @@ LEADERBOARD_DF = pd.read_csv("leaderboard_data.csv")
 # Ensure all data is treated as string initially for display consistency
 LEADERBOARD_DF = LEADERBOARD_DF.astype(str)
 
+BIAS_DF = pd.read_csv("bias_evaluation_data.csv")
+BIAS_DF = BIAS_DF.fillna("-").astype(str)  # Fill NaN with '-' to match the LaTeX table; must run before astype(str), which would turn NaN into the string "nan"
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -93,7 +96,72 @@ with demo:
             )
 
         with gr.TabItem("π Bias-aware evaluation of VLM ", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            with gr.Column():
+                gr.Markdown("### Bias-Aware Evaluation Results")  # Title for this section
+                bias_table_output = gr.DataFrame(value=BIAS_DF, label="Bias Evaluation Results", interactive=True, wrap=True)
+
+                gr.Markdown("---")
+                gr.Markdown("### Display Options for Bias Table")
+
+                bias_all_columns_list = BIAS_DF.columns.tolist()
+                bias_column_selector = gr.CheckboxGroup(
+                    choices=bias_all_columns_list,
+                    value=bias_all_columns_list,
+                    label="Select Columns to Display:"
+                )
+
+                # Filter by Bias_Type
+                bias_type_filter_choices = BIAS_DF["Bias_Type"].unique().tolist() if "Bias_Type" in BIAS_DF.columns else []
+                bias_type_selector = gr.CheckboxGroup(
+                    choices=bias_type_filter_choices,
+                    value=bias_type_filter_choices,
+                    label="Filter by Bias Type:"
+                )
+
+                # Filter by Model (for the bias table)
+                bias_model_filter_choices = BIAS_DF["Model"].unique().tolist() if "Model" in BIAS_DF.columns else []
+                bias_model_selector = gr.CheckboxGroup(
+                    choices=bias_model_filter_choices,
+                    value=bias_model_filter_choices,
+                    label="Filter by Model:"
+                )
+
+                def update_bias_table(selected_cols, selected_bias_types, selected_models):
+                    temp_df = BIAS_DF.copy()
+
+                    if selected_bias_types and "Bias_Type" in temp_df.columns:
+                        temp_df = temp_df[temp_df["Bias_Type"].isin(selected_bias_types)]
+                    elif not selected_bias_types and "Bias_Type" in temp_df.columns:  # no bias types selected
+                        temp_df = pd.DataFrame(columns=BIAS_DF.columns)
+
+                    if selected_models and "Model" in temp_df.columns:
+                        temp_df = temp_df[temp_df["Model"].isin(selected_models)]
+                    elif not selected_models and "Model" in temp_df.columns:  # no models selected
+                        # If no bias types are selected either, show an empty frame outright.
+                        if not selected_bias_types:
+                            temp_df = pd.DataFrame(columns=BIAS_DF.columns)
+                        # If bias types ARE selected but no models are, honour the empty model
+                        # selection: keep the columns but drop every row.
+                        elif "Bias_Type" in temp_df.columns and temp_df["Bias_Type"].isin(selected_bias_types).any():
+                            temp_df = temp_df[~temp_df["Model"].isin(BIAS_DF["Model"].unique())]  # effectively no rows
+
+                    valid_selected_cols = [col for col in selected_cols if col in temp_df.columns]
+                    if not valid_selected_cols and not temp_df.empty:
+                        final_df = temp_df  # no columns chosen: fall back to the full table
+                    elif not valid_selected_cols and temp_df.empty:
+                        final_df = pd.DataFrame(columns=selected_cols)
+                    else:
+                        final_df = temp_df[valid_selected_cols]
+
+                    return gr.DataFrame.update(value=final_df)
+
+                bias_column_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector], outputs=[bias_table_output])
+                bias_type_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector], outputs=[bias_table_output])
+                bias_model_selector.change(fn=update_bias_table, inputs=[bias_column_selector, bias_type_selector, bias_model_selector], outputs=[bias_table_output])
+
+                # The original gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") is replaced by the table and its controls.
+                # To keep showing LLM_BENCHMARKS_TEXT as well, add it back here, e.g.:
+                # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     with gr.Row():
         with gr.Accordion("π Citation", open=False):
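Note on the new BIAS_DF lines: the NaN handling is order-sensitive, because astype(str) stringifies missing values to "nan", leaving fillna("-") nothing to replace. A minimal standalone pandas sketch of the difference (illustrative frame, no Gradio assumed):

import numpy as np
import pandas as pd

df = pd.DataFrame({"Model": ["InstructBLIP"], "CLIP-S": [np.nan]})

# Wrong order: astype(str) turns NaN into the string "nan" before fillna runs.
print(df.astype(str).fillna("-")["CLIP-S"].iloc[0])  # -> "nan"

# Right order: fill while the value is still a real NaN, then stringify.
print(df.fillna("-").astype(str)["CLIP-S"].iloc[0])  # -> "-"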
bias_evaluation_data.csv
ADDED
@@ -0,0 +1,16 @@
+Bias_Type,Model,CLIP-S,CapS_S,CapS_A,Recall,Noun,Verb,Syn,Sem,CH_s,FS,FS_s,Harm,N-avg
+Gender bias,MiniGPT-4,0.3,0.9,1.1,7.8,1.7,2.6,6.3,3.2,4.8,6.3,4.0,1.64,0.51
+Gender bias,InstructBLIP,0.8,2.7,1.2,8.4,1.9,3.3,1.0,0.1,6.8,3.8,5.0,0.72,0.40
+Gender bias,LLaVA-1.5,0.7,2.2,0.7,9.5,2.2,4.1,1.5,0.2,7.6,3.8,3.7,0.39,0.46
+Gender bias,mPLUG-Owl2,0.6,2.2,1.2,9.1,2.3,3.5,1.6,0.0,7.2,3.1,5.8,0.33,0.40
+Gender bias,Qwen2-VL,0.2,0.7,0.5,6.3,0.1,3.6,13.5,2.5,4.4,0.9,5.7,1.77,0.63
+Skin tone bias,MiniGPT-4,0.8,1.5,0.8,4.8,0.2,2.3,19.4,0.2,2.0,0.9,0.5,0.09,0.55
+Skin tone bias,InstructBLIP,0.5,1.4,0.2,8.4,1.9,1.1,6.8,0.1,4.0,2.4,1.1,0.09,0.51
+Skin tone bias,LLaVA-1.5,0.4,1.3,0.7,4.0,0.2,1.0,5.3,0.6,2.7,1.4,1.3,0.18,0.67
+Skin tone bias,mPLUG-Owl2,0.6,1.9,0.5,5.1,0.8,2.2,7.6,0.4,1.7,0.1,0.4,0.00,0.67
+Skin tone bias,Qwen2-VL,0.2,1.1,1.5,2.3,0.5,1.3,14.9,2.3,2.7,3.1,1.8,0.09,0.50
+Language discrepancy,MiniGPT-4,0.8,1.5,3.9,2.3,4.3,5.2,52.2,5.0,5.4,5.6,3.4,0.10,0.40
+Language discrepancy,InstructBLIP,,,,,,,,,,,,,
+Language discrepancy,LLaVA-1.5,0.4,0.8,2.0,1.1,1.1,1.8,11.4,1.8,4.7,2.0,1.6,0.06,0.95
+Language discrepancy,mPLUG-Owl2,1.4,1.6,4.9,1.5,1.1,3.7,37.5,8.4,17.0,6.3,1.3,0.02,0.57
+Language discrepancy,Qwen2-VL,0.2,3.6,6.7,1.9,3.9,3.8,90.8,26.2,6.4,7.5,2.1,0.14,0.28
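For reference, the net effect of update_bias_table on this data: deselecting every bias type or every model yields an empty table; otherwise rows are intersected across both filters and projected onto the chosen columns. A standalone sketch of that behaviour (filter_bias_table is an illustrative helper, not part of the commit; assumes the CSV sits next to the script):

import pandas as pd

BIAS_DF = pd.read_csv("bias_evaluation_data.csv").fillna("-").astype(str)

def filter_bias_table(df, bias_types, models, cols):
    # Intersect the two row filters, then keep only the requested columns.
    out = df[df["Bias_Type"].isin(bias_types) & df["Model"].isin(models)]
    valid = [c for c in cols if c in out.columns]
    return out[valid] if valid else out

print(filter_bias_table(BIAS_DF, ["Gender bias"], ["Qwen2-VL"], ["Model", "Harm", "N-avg"]))
# One row: Qwen2-VL with Harm 1.77 and N-avg 0.63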