import os
import json
from collections import Counter

import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from plotly.subplots import make_subplots

from scorer import question_scorer
from content import (
    format_error, format_warning, format_log, TITLE, LINKS,
    INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL,
    EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink,
    SUBMIT_INTRODUCTION,
)

TOKEN = os.environ.get("TOKEN", None)
OWNER = "Online-Mind2Web"
YEAR_VERSION = "2025"
LOCAL_DEBUG = True


def _format_sr_column(series: pd.Series) -> pd.Series:
    numeric = pd.to_numeric(series, errors="coerce")
    out = numeric.map(lambda x: f"{x:.1f}" if pd.notna(x) else "")
    # Wherever coercion failed (original was str / NaN), restore the original value
    mask = numeric.isna() & series.notna()
    out[mask] = series[mask]
    return out


def get_dataframe_from_results(eval_path):
    df = pd.read_csv(eval_path)
    if "Verified" not in df.columns:
        df = df.sort_values(by=["Average SR"], ascending=False)
    else:
        df = df.sort_values(
            by=["Verified", "Average SR"], ascending=[False, False], kind="mergesort"
        )
    for col in ["Easy", "Medium", "Hard", "Average SR"]:
        if col in df.columns:
            df[col] = _format_sr_column(df[col])
    return df


auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')

TYPES = ["str", "str", "str", "markdown", "number", "number", "number", "number", "str", "str", "markdown", "str"]


def df_to_gradio(df: pd.DataFrame) -> dict:
    display_df = df.drop(columns=["Release Time"], errors="ignore")
    headers = display_df.columns.tolist()
    data = display_df.values.tolist()
    # Simple styling: red background if not verified
    if "Verified" in display_df.columns:
        verified_idx = headers.index("Verified")
        styling = [
            ["background-color:#ffcccc" if not row[verified_idx] else "" for _ in headers]
            for row in data
        ]
    else:
        styling = [["" for _ in headers] for _ in data]
    return {"data": data, "headers": headers, "metadata": {"styling": styling}}


def gradio_plot_wrapper(json_file):
    return plot_heatmap_with_performance_bar(json_file.name)


def style_auto_df(df: pd.DataFrame):
    def _row_style(row):
        bg = "background-color: #ffcccc" if row["Verified"] != True else ""
        return [bg] * len(row)

    styler = df.style.apply(_row_style, axis=1)
    try:
        styler = styler.hide(axis="index")
    except Exception:
        pass
    return styler


def nice_bounds(low: float, high: float) -> tuple[float, float]:
    if low == high:
        low -= 1
        high += 1
    return (np.floor(low / 5) * 5, np.ceil(high / 5) * 5)
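# Worked example for nice_bounds (illustration only, not executed):
#   nice_bounds(31.8, 57.8) -> (30.0, 60.0)   # floor/ceil outward to multiples of 5
#   nice_bounds(40.0, 40.0) -> (35.0, 45.0)   # equal bounds are widened by 1 first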
def plot_sr_vs_time(df: pd.DataFrame, title: str = "Success rate over time") -> go.Figure:
    work = df[df["Verified"] == True].copy()  # filter out unverified rows
    work["Release Time"] = pd.to_datetime(work["Release Time"], errors="coerce")
    work["Average SR"] = pd.to_numeric(work["Average SR"], errors="coerce")
    work = work.dropna(subset=["Release Time", "Average SR"])

    agents = work["Agent"].unique().tolist()
    color_map = {a: f"hsl({int(360 * i / len(agents))},70%,45%)" for i, a in enumerate(agents)}

    fig = go.Figure()
    y_min_raw, y_max_raw = work["Average SR"].min(), work["Average SR"].max()
    y_min, y_max = nice_bounds(y_min_raw, y_max_raw)

    # Three translucent horizontal background bands spanning the y-range
    band_edges = np.linspace(y_min, y_max, 4)
    band_cols = ["rgba(226,247,226,0.35)", "rgba(255,255,204,0.35)", "rgba(255,228,225,0.35)"]
    shapes = [
        dict(type="rect", xref="paper", yref="y", x0=0, x1=1,
             y0=band_edges[i], y1=band_edges[i + 1],
             fillcolor=band_cols[i], line_width=0)
        for i in range(3)
    ]

    # One marker per agent; <br> produces line breaks in Plotly hover text
    for _, row in work.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row["Release Time"]],
                y=[row["Average SR"]],
                mode="markers+text",
                text=[row["Agent"]],
                textposition="top center",
                textfont=dict(size=11),
                marker=dict(size=10, color=color_map[row["Agent"]], opacity=0.9),
                hovertemplate="Agent: %{text}<br>SR: %{y:.1f}%<br>Date: %{x|%Y-%m}",
                showlegend=False,
            )
        )

    # Dashed linear trend line (least-squares fit over date ordinals)
    if len(work) >= 2:
        x_numeric = work["Release Time"].map(pd.Timestamp.toordinal)
        slope, intercept = np.polyfit(x_numeric, work["Average SR"], 1)
        x_range = pd.date_range(work["Release Time"].min(), work["Release Time"].max(), freq="MS")
        y_pred = slope * x_range.map(pd.Timestamp.toordinal) + intercept
        fig.add_trace(go.Scatter(
            x=x_range, y=y_pred, mode="lines",
            line=dict(color="rgba(0,0,0,0.6)", dash="dash"),
            name="Trend", hoverinfo="skip",
        ))

    fig.update_layout(
        title=dict(text=title, x=0.5, xanchor="center", font=dict(size=22)),
        xaxis_title="Release Time",
        yaxis_title="Success Rate",
        template="plotly_white",
        width=1800,
        height=800,
        shapes=shapes,
    )
    fig.update_xaxes(dtick="M1", tickformat="%Y-%m", showspikes=True, spikemode="across",
                     spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
    fig.update_yaxes(showspikes=True, spikemode="across",
                     spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
    return fig
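# Input sketch for plot_heatmap_with_performance_bar, inferred from the parsing
# logic below; the agent names and values here are illustrative assumptions:
# [
#   {"task_id": "t001", "Operator_human_label": "1", "SeeAct_human_label": "0"},
#   {"task_id": "t002", "Operator_human_label": "0", "SeeAct_human_label": "1"},
#   ...
# ]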
def plot_heatmap_with_performance_bar(json_file):
    with open(json_file, "r") as f:
        data = json.load(f)

    agents = [k for k in data[0].keys() if k.endswith("_human_label")]
    records = []
    original_ids = [task["task_id"] for task in data]
    for task in data:
        task_id = task["task_id"]
        for agent in agents:
            raw_val = task.get(agent, "0")
            try:
                val = int(raw_val)
            except ValueError:
                val = 0
            val = 1 if val == 1 else 0  # anything other than 1 counts as a failure
            records.append({
                "Task ID": task_id,
                "Agent": agent.replace("_human_label", ""),
                "Success": val,
            })

    df = pd.DataFrame(records)
    pivot = df.pivot_table(index="Agent", columns="Task ID", values="Success", aggfunc="max")

    # Keep every task column, in the original file order
    for task_id in original_ids:
        if task_id not in pivot.columns:
            pivot[task_id] = 0
    pivot = pivot[original_ids]

    # Sort agents by overall success rate, descending
    agent_success_rate = pivot.sum(axis=1) / pivot.shape[1]
    pivot["SuccessRate"] = agent_success_rate
    pivot = pivot.sort_values(by="SuccessRate", ascending=False)
    pivot = pivot.drop(columns=["SuccessRate"])

    agent_name_map = {
        "Operator": "Operator",
        "Agent-E": "Agent-E",
        "Browser_Use": "Browser Use",
        "Claude_Computer_Use": "Claude Computer Use",
        "SeeAct": "SeeAct",
    }
    sorted_agents = pivot.index.tolist()
    pivot.index = [
        f"{agent_name_map.get(agent, agent)} ({agent_success_rate[agent] * 100:.1f}%)"
        for agent in sorted_agents
    ]

    custom_labels = [["Success" if val == 1 else "Failure" for val in row] for row in pivot.values]

    any_agent_solved = pivot.max(axis=0).sum()   # tasks solved by at least one agent
    best_agent_solved = pivot.sum(axis=1).max()  # tasks solved by the single best agent
    total_tasks = len(original_ids)

    fig = make_subplots(
        rows=2, cols=1,
        row_heights=[0.8, 0.2],
        vertical_spacing=0.08,
        subplot_titles=("TASK ID", ""),
        shared_xaxes=False,
    )

    fig.add_trace(go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale=[[0, "white"], [1, "skyblue"]],
        zmin=0, zmax=1,
        showscale=False,
        customdata=custom_labels,
        hovertemplate="Agent: %{y}<br>Task ID: %{x}<br>Completion: %{customdata}",
    ), row=1, col=1)

    fig.add_trace(go.Bar(
        y=["Any agent", "Best agent"],
        x=[any_agent_solved, best_agent_solved],
        orientation="h",
        marker_color=["dodgerblue", "mediumseagreen"],
        text=[
            f"{int(any_agent_solved)}/{total_tasks} ({any_agent_solved / total_tasks:.1%})",
            f"{int(best_agent_solved)}/{total_tasks} ({best_agent_solved / total_tasks:.1%})",
        ],
        textposition="auto",
        showlegend=False,
    ), row=2, col=1)

    # Invisible traces that only supply the Success / Failure legend entries
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode="markers",
        marker=dict(size=10, color="skyblue"),
        name="Success",
    ))
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode="markers",
        marker=dict(size=10, color="white", line=dict(width=1, color="black")),
        name="Failure",
    ))

    fig.update_xaxes(range=[0, total_tasks], row=2, col=1)
    fig.update_layout(
        height=600,
        xaxis=dict(showticklabels=False),
        yaxis=dict(title="Agent"),
        yaxis2=dict(title=""),
        margin=dict(t=60),
    )
    return fig
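# Standalone usage sketch (assumes ./human_label.json exists; the app itself
# calls this function inside the "Human Evaluation" tab below):
#   fig = plot_heatmap_with_performance_bar("./human_label.json")
#   fig.write_html("heatmap_preview.html")  # Plotly's standard HTML export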
def refresh():
    # Re-read both CSVs so the tables and the SR-over-time plot pick up new rows
    auto_df = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
    human_df = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
    sr_plot = plot_sr_vs_time(auto_df)
    auto_table = gr.Dataframe(
        value=df_to_gradio(auto_df), datatype=TYPES, interactive=False,
        wrap=False, elem_id="auto-leaderboard-table",
    )
    return auto_table, human_df, sr_plot
demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")
with demo:
    gr.HTML(TITLE)
    gr.HTML(LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            )
    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        # elem_id added so the CSS rule defined above actually targets this table
        human_leaderboard_table_test = gr.Dataframe(
            value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
            wrap=False, elem_id="human-leaderboard-table",
        )
        gr.Markdown("### Visualization")
        gr.Markdown(
            "This figure presents a fine-grained heatmap of task-level completion across "
            "agents. Each row corresponds to an agent, and each column represents a task "
            "(identified by its task ID). Blue cells indicate successful completions, while "
            "white cells denote failures. \"Any agent\" counts a task as solved if at least "
            "one agent completes it. (This style of visualization is inspired by "
            "[HAL](https://hal.cs.princeton.edu/).)"
        )
        fig = plot_heatmap_with_performance_bar("./human_label.json")
        gr.Plot(fig)
        gr.Markdown(EVALUATION_DETAILS)

    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        sr_time_plot = gr.Plot(plot_sr_vs_time(auto_eval_dataframe_test))
        gr.Markdown(
            '### Agents highlighted in red represent unverified results that may involve '
            'unreliable evaluations and are provided for reference only.<br>'
            'You can refer to the "Note" column for more details.'
        )
        auto_leaderboard_table_test = gr.Dataframe(
            value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES,
            interactive=False, wrap=False, elem_id="auto-leaderboard-table",
        )

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            auto_leaderboard_table_test,
            human_leaderboard_table_test,
            sr_time_plot,
        ],
    )

scheduler = BackgroundScheduler()  # NOTE: started without any registered jobs
scheduler.start()

if __name__ == "__main__":
    demo.launch(debug=True, share=True)
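# A hedged option, not part of the original app: the scheduler above currently
# does nothing. If periodic refreshing of the leaderboard were desired,
# APScheduler's standard interval trigger could be registered before launch:
#   scheduler.add_job(refresh, "interval", hours=1)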