WeijianQi1999 committed on
Commit 49268fc · 1 Parent(s): 1f3e8c7

update 0602

app.py CHANGED
@@ -6,6 +6,7 @@ import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from apscheduler.schedulers.background import BackgroundScheduler
+ import numpy as np

from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
@@ -16,22 +17,125 @@ OWNER = "Online-Mind2Web"
YEAR_VERSION = "2025"
LOCAL_DEBUG = True

+ def _format_sr_column(series: pd.Series) -> pd.Series:
+     numeric = pd.to_numeric(series, errors="coerce")
+     out = numeric.map(lambda x: f"{x:.1f}" if pd.notna(x) else "")
+
+     # Wherever coercion failed (original was str / NaN), restore the original value
+     mask = numeric.isna() & series.notna()
+     out[mask] = series[mask]
+     return out
+
def get_dataframe_from_results(eval_path):
    df = pd.read_csv(eval_path)
-     df = df.sort_values(by=["Average SR"], ascending=False)
-     for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
-         df[format_column] = df[format_column].map('{:.1f}'.format)
+
+     if "Verified" not in df.columns:
+         df = df.sort_values(by=["Average SR"], ascending=False)
+     else:
+         df = df.sort_values(
+             by=["Verified", "Average SR"],
+             ascending=[False, False],  # descending on both, so Verified=True rows sort to the top
+             kind="mergesort"  # stable sort keeps the remaining order predictable
+         )
+
+     for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
+         if col in df.columns:
+             df[col] = _format_sr_column(df[col])
+
    return df

- # auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
- TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
+ TYPES = ["str", "str", "str", "markdown", "number", "number", "number", "number", "str", "str", "markdown", "str"]

- def refresh():
-     auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
-     human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
-     return auto_eval_dataframe_test, human_eval_dataframe_test
+ def df_to_gradio(df: pd.DataFrame) -> dict:
+     display_df = df.drop(columns=["Release Time"], errors="ignore")
+     headers = display_df.columns.tolist()
+     data = display_df.values.tolist()
+     # simple styling: red if not verified
+     if "Verified" in display_df.columns:
+         verified_idx = headers.index("Verified")
+         styling = [["background-color:#ffcccc" if not row[verified_idx] else "" for _ in headers] for row in data]
+     else:
+         styling = [["" for _ in headers] for _ in data]
+     return {"data": data, "headers": headers, "metadata": {"styling": styling}}
+
+ def gradio_plot_wrapper(json_file):
+     return plot_heatmap_with_performance_bar(json_file.name)
+
+ def style_auto_df(df: pd.DataFrame):
+     def _row_style(row):
+         bg = "background-color: #ffcccc" if row["Verified"] != True else ""
+         return [bg] * len(row)
+
+     styler = df.style.apply(_row_style, axis=1)
+     try:
+         styler = styler.hide(axis="index")
+     except Exception:
+         pass
+     return styler
+
+ def nice_bounds(low: float, high: float) -> tuple[float, float]:
+     if low == high:
+         low -= 1; high += 1
+     return (np.floor(low / 5) * 5, np.ceil(high / 5) * 5)
+
+ def plot_sr_vs_time(df: pd.DataFrame, title: str = "Success rate over time") -> go.Figure:
+
+     work = df[df["Verified"] == True].copy()  # filter out unverified rows
+
+     work["Release Time"] = pd.to_datetime(work["Release Time"], errors="coerce")
+     work["Average SR"] = pd.to_numeric(work["Average SR"], errors="coerce")
+     work = work.dropna(subset=["Release Time", "Average SR"])
+
+     agents = work["Agent"].unique().tolist()
+     color_map = {a: f"hsl({int(360*i/len(agents))},70%,45%)" for i, a in enumerate(agents)}
+
+     fig = go.Figure()
+
+     y_min_raw, y_max_raw = work["Average SR"].min(), work["Average SR"].max()
+     y_min, y_max = nice_bounds(y_min_raw, y_max_raw)
+     band_edges = np.linspace(y_min, y_max, 4)
+     band_cols = ["rgba(226,247,226,0.35)", "rgba(255,255,204,0.35)", "rgba(255,228,225,0.35)"]
+     shapes = [
+         dict(type="rect", xref="paper", yref="y", x0=0, x1=1, y0=band_edges[i], y1=band_edges[i+1],
+              fillcolor=band_cols[i], line_width=0)
+         for i in range(3)
+     ]
+
+     for _, row in work.iterrows():
+         fig.add_trace(
+             go.Scatter(
+                 x=[row["Release Time"]],
+                 y=[row["Average SR"]],
+                 mode="markers+text",
+                 text=[row["Agent"]],
+                 textposition="top center",
+                 textfont=dict(size=11),
+                 marker=dict(size=10, color=color_map[row["Agent"]], opacity=0.9),
+                 hovertemplate="Agent: %{text}<br>SR: %{y:.1f}%<br>Date: %{x|%Y-%m}<extra></extra>",
+                 showlegend=False,
+             )
+         )
+
+     if len(work) >= 2:
+         x_numeric = work["Release Time"].map(pd.Timestamp.toordinal)
+         slope, intercept = np.polyfit(x_numeric, work["Average SR"], 1)
+         x_range = pd.date_range(work["Release Time"].min(), work["Release Time"].max(), freq="MS")
+         y_pred = slope * x_range.map(pd.Timestamp.toordinal) + intercept
+         fig.add_trace(go.Scatter(x=x_range, y=y_pred, mode="lines", line=dict(color="rgba(0,0,0,0.6)", dash="dash"), name="Trend", hoverinfo="skip"))
+
+     fig.update_layout(
+         title=dict(text=title, x=0.5, xanchor="center", font=dict(size=22)),
+         xaxis_title="Release Time",
+         yaxis_title="Success Rate",
+         template="plotly_white",
+         width=1800, height=800,
+         shapes=shapes,
+     )
+     fig.update_xaxes(dtick="M1", tickformat="%Y-%m", showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
+     fig.update_yaxes(showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
+     return fig

def plot_heatmap_with_performance_bar(json_file):
    with open(json_file, "r") as f:
@@ -143,8 +247,14 @@ def plot_heatmap_with_performance_bar(json_file):
    )
    return fig

- def gradio_plot_wrapper(json_file):
-     return plot_heatmap_with_performance_bar(json_file.name)
+ def refresh():
+     auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
+     human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
+     sr_time_plot = plot_sr_vs_time(auto_eval_dataframe_test)
+     auto_eval_dataframe_test = gr.Dataframe(value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES, interactive=False, wrap=False, elem_id="auto-leaderboard-table")
+
+     return auto_eval_dataframe_test, human_eval_dataframe_test, sr_time_plot
+

demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")

@@ -178,12 +288,10 @@ with demo:
        gr.Markdown(EVALUATION_DETAILS)

    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
-         auto_leaderboard_table_test = gr.Dataframe(
-             value=auto_eval_dataframe_test,
-             datatype=TYPES,
-             interactive=False,
-             wrap=False
-         )
+         sr_time_plot = gr.Plot(plot_sr_vs_time(auto_eval_dataframe_test))
+         gr.Markdown('### Agents highlighted in red represent unverified results that may involve unreliable evaluations and are provided for reference only. You can refer to the "Note" column for more details.')
+         auto_leaderboard_table_test = gr.Dataframe(value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES, interactive=False, wrap=False, elem_id="auto-leaderboard-table")
+

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
@@ -193,7 +301,11 @@ with demo:
    refresh_button.click(
        refresh,
        inputs=[],
-         outputs=[auto_leaderboard_table_test, human_leaderboard_table_test],
+         outputs=[
+             auto_leaderboard_table_test,
+             human_leaderboard_table_test,
+             sr_time_plot
+         ],
    )


@@ -201,4 +313,4 @@ scheduler = BackgroundScheduler()
scheduler.start()

if __name__ == "__main__":
-     demo.launch(debug=True)
+     demo.launch(debug=True, share=True)
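The core behavioral change in `get_dataframe_from_results` above is the Verified-aware sort plus the tolerant score formatting. Here is a minimal standalone sketch of that logic, using toy rows with hypothetical values rather than the real leaderboard data:

```python
import pandas as pd

def format_sr(series: pd.Series) -> pd.Series:
    # Same idea as _format_sr_column above: one-decimal formatting for numeric
    # scores, while non-numeric placeholders such as "-" are left untouched.
    numeric = pd.to_numeric(series, errors="coerce")
    out = numeric.map(lambda x: f"{x:.1f}" if pd.notna(x) else "")
    mask = numeric.isna() & series.notna()
    out[mask] = series[mask]
    return out

# Toy rows (hypothetical values) with the columns the sort and formatting touch.
df = pd.DataFrame({
    "Agent": ["A", "B", "C"],
    "Hard": [27.0, "-", 39.2],
    "Average SR": [47.3, 78.0, 58.3],
    "Verified": [True, False, True],
})

# Verified rows first, then by Average SR descending within each group.
df = df.sort_values(by=["Verified", "Average SR"], ascending=[False, False], kind="mergesort")
for col in ["Hard", "Average SR"]:
    df[col] = format_sr(df[col])

print(df)  # order: C (58.3), A (47.3), then unverified B despite its 78.0
```

The effect of sorting on `Verified` first is that unverified, self-reported numbers can never displace verified entries at the top of the table, no matter how high their reported success rate is.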
auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv CHANGED
@@ -1,7 +1,10 @@
- Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
- Operator,OpenAI Computer-Using Agent,OpenAI,OSU NLP,73.5,59.4,39.2,58.3,2025-5-11
- SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,51.8,28,9.5,30,2025-5-11
- Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,44.6,23.1,10.8,26,2025-5-11
- Claude Computer Use 3.5,Claude-3-5-sonnet-20241022,Anthropic,OSU NLP,51.8,16.1,8.1,24,2025-5-11
- Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,51.8,23.1,6.8,27,2025-5-11
- Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,75.9,41.3,27,47.3,2025-5-11
+ Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date,Verified,Note,Release Time
+ Operator,OpenAI Computer-Using Agent,OpenAI,[OSU NLP](https://arxiv.org/abs/2504.01382),73.5,59.4,39.2,58.3,2025-5-11,True,,2025-01
+ SeeAct,gpt-4o-2024-08-06,OSU,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,28,9.5,30,2025-5-11,True,,2024-01
+ Browser Use,gpt-4o-2024-08-06,Browser Use,[OSU NLP](https://arxiv.org/abs/2504.01382),44.6,23.1,10.8,26,2025-5-11,True,,2025-01
+ Claude Computer Use 3.5,Claude-3-5-sonnet-20241022,Anthropic,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,16.1,8.1,24,2025-5-11,True,,2024-10
+ Agent-E,gpt-4o-2024-08-06,Emergence AI,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,23.1,6.8,27,2025-5-11,True,,2024-07
+ Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,[OSU NLP](https://arxiv.org/abs/2504.01382),75.9,41.3,27,47.3,2025-5-11,True,,2025-02
+ Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
+ Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
+ Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05
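A hedged sanity check for the expanded schema (new `Verified`, `Note`, and `Release Time` columns; `Source` now carries markdown links that the `markdown` entries in `TYPES` render as hyperlinks), assuming the CSV sits next to the script as in app.py:

```python
import pandas as pd

# Load the updated leaderboard CSV and confirm the new columns are present.
df = pd.read_csv("./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv")

expected = {"Agent", "Model", "Organization", "Source", "Easy", "Medium", "Hard",
            "Average SR", "Date", "Verified", "Note", "Release Time"}
missing = expected - set(df.columns)
assert not missing, f"missing columns: {missing}"

# Verified/unverified split and release dates feed the new SR-over-time plot.
print(df[["Agent", "Average SR", "Verified", "Release Time"]])
```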
content.py CHANGED
@@ -9,14 +9,22 @@ LINKS = """
"""

INTRODUCTION_TEXT = """
- Online-Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
- Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1-5 steps), Medium (6-10 steps), and Hard (11+ steps).
+ Online-Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains, with reliable LLM-as-a-Judge (WebJudge) automatic evaluation.
+ Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1-5 steps), Medium (6-10 steps), and Hard (11+ steps).
"""

LEADERBOARD_TEXT = """
- ### Leaderboard
+ ## Leaderboard
Our goal is to conduct a rigorous assessment of the current state of web agents. We maintain two leaderboards—one for automatic evaluation and another for human evaluation.
- Please click "Submission Guideline" for details.
+
+ When using our benchmark or submitting results, please first carefully review the important notes below to ensure proper usage and reliable evaluation results, and follow the "Submission Guideline".
+
+ ### ⚠ Important Notes for Reliable Evaluation:
+ - **Start from the specified websites, not Google Search**: To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
+ - **Include only factual actions, not agent outputs**: The action history should contain only the factual actions taken by the agent to complete the task (e.g., clicking elements and typing text). Do not include the final response or any other agent outputs, as they may contain hallucinated content and result in a high rate of false positives.
+ - **Use o4-mini for WebJudge**: WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+
+ **Please do not use it as training data for your agent.**
"""

SUBMISSION_TEXT = """
@@ -29,8 +37,8 @@ Submissions must include a sequence of images (i.e., screenshots in the trajectory)
EVALUATION_DETAILS = """
In certain scenarios, testing on the full Online-Mind2Web dataset may not be feasible due to cost, privacy, or legal constraints. To facilitate fair and apples-to-apples comparisons, we release both our human evaluation labels and auto-eval details.

- - **Human Evaluation**: Task-level human evaluation labels are provided in the [file](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/human_label.json).
- - **Auto-Evaluation**: The results of WebJudge are available in the [folder](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results)."""
+ - **Human Evaluation**: Task-level human evaluation labels are provided in the [file](https://github.com/OSU-NLP-Group/Online-Mind2Web/blob/main/data/evaluation_results/online_mind2web_evaluation_results/human_label.json).
+ - **Auto-Evaluation**: The results of WebJudge are available in the [folder](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/online_mind2web_evaluation_results/webjudge_o4-mini)."""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Note: Online-Mind2Web is derived from the original Mind2Web dataset. We kindly ask that you cite both the original and this work when using or referencing the data."
CITATION_BUTTON_TEXT = r"""
@@ -57,14 +65,14 @@ CITATION_BUTTON_TEXT = r"""
"""

SUBMIT_INTRODUCTION = """
- You should use the script provided in our GitHub repository to obtain automatic evaluation results on your own and submit them along with all trajectories.
- To ensure the authenticity and reliability of the reported results, we will also conduct a verification of auto-eval results.
+ You should use the script provided in our GitHub [repository](https://github.com/OSU-NLP-Group/Online-Mind2Web) to obtain automatic evaluation results on your own and submit them along with all trajectories to enhance transparency.
+ To ensure the authenticity and reliability of the reported results, we will also verify the auto-eval results.
If you have conducted your own human evaluation, please also attach your human-eval results. We will spot-check these before adding them to the human-eval table.

- ## Important Notes for Reliable Evaluation:
- - To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
- - The action history should contain only the actions taken by the agent to complete the task (e.g., clicking elements and typing text). Please avoid including the final response, as it may contain hallucinated content, leading to a high rate of false positives.
- - WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+ ## Important Notes for Reliable Evaluation:
+ - **Start from the specified websites, not Google Search**: To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
+ - **Include only factual actions, not agent outputs**: The action history should contain only the factual actions taken by the agent to complete the task (e.g., clicking elements and typing text). Do not include the final response or any other agent outputs, as they may contain hallucinated content and result in a high rate of false positives.
+ - **Use o4-mini for WebJudge**: WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.

## ⚠ Please submit the trajectory file with the following format:
The result of each task is stored in a folder named after its `task_id`, containing:
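The "include only factual actions" rule above is easy to violate when exporting trajectories. Below is a purely illustrative sketch of that filtering; the step fields (`action_type`, `final_response`, and so on) are hypothetical and do not reflect the repository's actual trajectory schema:

```python
def factual_actions_only(steps: list[dict]) -> list[dict]:
    """Keep concrete interactions (click/type/...); drop model-generated text such as a
    final response, which the notes above warn can introduce hallucinated content."""
    allowed = {"click", "type", "select", "scroll"}
    return [s for s in steps if s.get("action_type") in allowed]

# Hypothetical trajectory: the last entry is the agent's own summary and must not
# be passed to WebJudge as part of the action history.
trajectory = [
    {"action_type": "click", "target": "search button"},
    {"action_type": "type", "target": "search box", "value": "wireless mouse"},
    {"action_type": "final_response", "value": "I have completed the task."},
]
print(factual_actions_only(trajectory))  # the final_response entry is filtered out
```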
requirements.txt CHANGED
@@ -2,5 +2,4 @@ datasets
gradio
huggingface-hub
numpy
- APScheduler
- plotly
+ APScheduler