WeijianQi1999 committed on
Commit 49268fc · 1 Parent(s): 1f3e8c7

update 0602

app.py CHANGED
@@ -6,6 +6,7 @@ import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from apscheduler.schedulers.background import BackgroundScheduler
+ import numpy as np

from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
@@ -16,22 +17,125 @@ OWNER = "Online-Mind2Web"
YEAR_VERSION = "2025"
LOCAL_DEBUG = True

+ def _format_sr_column(series: pd.Series) -> pd.Series:
+     numeric = pd.to_numeric(series, errors="coerce")
+     out = numeric.map(lambda x: f"{x:.1f}" if pd.notna(x) else "")
+
+     # Wherever coercion failed (original was str / NaN), restore the original value
+     mask = numeric.isna() & series.notna()
+     out[mask] = series[mask]
+     return out
+
def get_dataframe_from_results(eval_path):
    df = pd.read_csv(eval_path)
-     df = df.sort_values(by=["Average SR"], ascending=False)
-     for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
-         df[format_column] = df[format_column].map('{:.1f}'.format)
+
+     if "Verified" not in df.columns:
+         df = df.sort_values(by=["Average SR"], ascending=False)
+     else:
+         df = df.sort_values(
+             by=["Verified", "Average SR"],
+             ascending=[False, False],  # descending on both, so Verified=True rows sort to the top
+             kind="mergesort"  # stable sort keeps the remaining order predictable
+         )
+
+     for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
+         if col in df.columns:
+             df[col] = _format_sr_column(df[col])
+
    return df

- # auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
- TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
+ TYPES = ["str", "str", "str", "markdown", "number", "number", "number", "number", "str", "str", "markdown", "str"]

- def refresh():
-     auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
-     human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
-     return auto_eval_dataframe_test, human_eval_dataframe_test
+ def df_to_gradio(df: pd.DataFrame) -> dict:
+     display_df = df.drop(columns=["Release Time"], errors="ignore")
+     headers = display_df.columns.tolist()
+     data = display_df.values.tolist()
+     # simple styling: red if not verified
+     if "Verified" in display_df.columns:
+         verified_idx = headers.index("Verified")
+         styling = [["background-color:#ffcccc" if not row[verified_idx] else "" for _ in headers] for row in data]
+     else:
+         styling = [["" for _ in headers] for _ in data]
+     return {"data": data, "headers": headers, "metadata": {"styling": styling}}
+
+ def gradio_plot_wrapper(json_file):
+     return plot_heatmap_with_performance_bar(json_file.name)
+
+ def style_auto_df(df: pd.DataFrame):
+     def _row_style(row):
+         bg = "background-color: #ffcccc" if row["Verified"] != True else ""
+         return [bg] * len(row)
+
+     styler = df.style.apply(_row_style, axis=1)
+     try:
+         styler = styler.hide(axis="index")
+     except Exception:
+         pass
+     return styler
+
+ def nice_bounds(low: float, high: float) -> tuple[float, float]:
+     if low == high:
+         low -= 1; high += 1
+     return (np.floor(low / 5) * 5, np.ceil(high / 5) * 5)
+
+ def plot_sr_vs_time(df: pd.DataFrame, title: str = "Success rate over time") -> go.Figure:
+
+     work = df[df["Verified"] == True].copy()  # filter out unverified rows
+
+     work["Release Time"] = pd.to_datetime(work["Release Time"], errors="coerce")
+     work["Average SR"] = pd.to_numeric(work["Average SR"], errors="coerce")
+     work = work.dropna(subset=["Release Time", "Average SR"])
+
+     agents = work["Agent"].unique().tolist()
+     color_map = {a: f"hsl({int(360*i/len(agents))},70%,45%)" for i, a in enumerate(agents)}
+
+     fig = go.Figure()
+
+     y_min_raw, y_max_raw = work["Average SR"].min(), work["Average SR"].max()
+     y_min, y_max = nice_bounds(y_min_raw, y_max_raw)
+     band_edges = np.linspace(y_min, y_max, 4)
+     band_cols = ["rgba(226,247,226,0.35)", "rgba(255,255,204,0.35)", "rgba(255,228,225,0.35)"]
+     shapes = [
+         dict(type="rect", xref="paper", yref="y", x0=0, x1=1, y0=band_edges[i], y1=band_edges[i+1],
+              fillcolor=band_cols[i], line_width=0)
+         for i in range(3)
+     ]
+
+     for _, row in work.iterrows():
+         fig.add_trace(
+             go.Scatter(
+                 x=[row["Release Time"]],
+                 y=[row["Average SR"]],
+                 mode="markers+text",
+                 text=[row["Agent"]],
+                 textposition="top center",
+                 textfont=dict(size=11),
+                 marker=dict(size=10, color=color_map[row["Agent"]], opacity=0.9),
+                 hovertemplate="Agent: %{text}<br>SR: %{y:.1f}%<br>Date: %{x|%Y-%m}<extra></extra>",
+                 showlegend=False,
+             )
+         )
+
+     if len(work) >= 2:
+         x_numeric = work["Release Time"].map(pd.Timestamp.toordinal)
+         slope, intercept = np.polyfit(x_numeric, work["Average SR"], 1)
+         x_range = pd.date_range(work["Release Time"].min(), work["Release Time"].max(), freq="MS")
+         y_pred = slope * x_range.map(pd.Timestamp.toordinal) + intercept
+         fig.add_trace(go.Scatter(x=x_range, y=y_pred, mode="lines", line=dict(color="rgba(0,0,0,0.6)", dash="dash"), name="Trend", hoverinfo="skip"))
+
+     fig.update_layout(
+         title=dict(text=title, x=0.5, xanchor="center", font=dict(size=22)),
+         xaxis_title="Release Time",
+         yaxis_title="Success Rate",
+         template="plotly_white",
+         width=1800, height=800,
+         shapes=shapes,
+     )
+     fig.update_xaxes(dtick="M1", tickformat="%Y-%m", showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
+     fig.update_yaxes(showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
+     return fig

def plot_heatmap_with_performance_bar(json_file):
    with open(json_file, "r") as f:
@@ -143,8 +247,14 @@ def plot_heatmap_with_performance_bar(json_file):
    )
    return fig

- def gradio_plot_wrapper(json_file):
-     return plot_heatmap_with_performance_bar(json_file.name)
+ def refresh():
+     auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
+     human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
+     sr_time_plot = plot_sr_vs_time(auto_eval_dataframe_test)
+     auto_eval_dataframe_test = gr.Dataframe(value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES, interactive=False, wrap=False, elem_id="auto-leaderboard-table")
+
+     return auto_eval_dataframe_test, human_eval_dataframe_test, sr_time_plot
+

demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")

@@ -178,12 +288,10 @@ with demo:
        gr.Markdown(EVALUATION_DETAILS)

    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
-         auto_leaderboard_table_test = gr.Dataframe(
-             value=auto_eval_dataframe_test,
-             datatype=TYPES,
-             interactive=False,
-             wrap=False
-         )
+         sr_time_plot = gr.Plot(plot_sr_vs_time(auto_eval_dataframe_test))
+         gr.Markdown('### Agents highlighted in red represent unverified results that may involve unreliable evaluations and are provided for reference only. You can refer to the "Note" column for more details.')
+         auto_leaderboard_table_test = gr.Dataframe(value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES, interactive=False, wrap=False, elem_id="auto-leaderboard-table")
+

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
@@ -193,7 +301,11 @@ with demo:
    refresh_button.click(
        refresh,
        inputs=[],
-         outputs=[auto_leaderboard_table_test, human_leaderboard_table_test],
+         outputs=[
+             auto_leaderboard_table_test,
+             human_leaderboard_table_test,
+             sr_time_plot
+         ],
    )


@@ -201,4 +313,4 @@ scheduler = BackgroundScheduler()
scheduler.start()

if __name__ == "__main__":
-     demo.launch(debug=True)
+     demo.launch(debug=True, share=True)
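The core behavioral change in `get_dataframe_from_results` above is the Verified-aware sort plus the tolerant score formatting. Here is a minimal standalone sketch of that logic, using toy rows with hypothetical values rather than the real leaderboard data:

```python
import pandas as pd

def format_sr(series: pd.Series) -> pd.Series:
    # Same idea as _format_sr_column above: one-decimal formatting for numeric
    # scores, while non-numeric placeholders such as "-" are left untouched.
    numeric = pd.to_numeric(series, errors="coerce")
    out = numeric.map(lambda x: f"{x:.1f}" if pd.notna(x) else "")
    mask = numeric.isna() & series.notna()
    out[mask] = series[mask]
    return out

# Toy rows (hypothetical values) with the columns the sort and formatting touch.
df = pd.DataFrame({
    "Agent": ["A", "B", "C"],
    "Hard": [27.0, "-", 39.2],
    "Average SR": [47.3, 78.0, 58.3],
    "Verified": [True, False, True],
})

# Verified rows first, then by Average SR descending within each group.
df = df.sort_values(by=["Verified", "Average SR"], ascending=[False, False], kind="mergesort")
for col in ["Hard", "Average SR"]:
    df[col] = format_sr(df[col])

print(df)  # order: C (58.3), A (47.3), then unverified B despite its 78.0
```

The effect of sorting on `Verified` first is that unverified, self-reported numbers can never displace verified entries at the top of the table, no matter how high their reported success rate is.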
auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv CHANGED
@@ -1,7 +1,10 @@
- Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
- Operator,OpenAI Computer-Using Agent,OpenAI,OSU NLP,73.5,59.4,39.2,58.3,2025-5-11
- SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,51.8,28,9.5,30,2025-5-11
- Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,44.6,23.1,10.8,26,2025-5-11
- Claude Computer Use 3.5,Claude-3-5-sonnet-20241022,Anthropic,OSU NLP,51.8,16.1,8.1,24,2025-5-11
- Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,51.8,23.1,6.8,27,2025-5-11
- Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,75.9,41.3,27,47.3,2025-5-11
+ Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date,Verified,Note,Release Time
+ Operator,OpenAI Computer-Using Agent,OpenAI,[OSU NLP](https://arxiv.org/abs/2504.01382),73.5,59.4,39.2,58.3,2025-5-11,True,,2025-01
+ SeeAct,gpt-4o-2024-08-06,OSU,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,28,9.5,30,2025-5-11,True,,2024-01
+ Browser Use,gpt-4o-2024-08-06,Browser Use,[OSU NLP](https://arxiv.org/abs/2504.01382),44.6,23.1,10.8,26,2025-5-11,True,,2025-01
+ Claude Computer Use 3.5,Claude-3-5-sonnet-20241022,Anthropic,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,16.1,8.1,24,2025-5-11,True,,2024-10
+ Agent-E,gpt-4o-2024-08-06,Emergence AI,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,23.1,6.8,27,2025-5-11,True,,2024-07
+ Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,[OSU NLP](https://arxiv.org/abs/2504.01382),75.9,41.3,27,47.3,2025-5-11,True,,2025-02
+ Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
+ Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
+ Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05
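A hedged sanity check for the expanded schema (new `Verified`, `Note`, and `Release Time` columns; `Source` now carries markdown links that the `markdown` entries in `TYPES` render as hyperlinks), assuming the CSV sits next to the script as in app.py:

```python
import pandas as pd

# Load the updated leaderboard CSV and confirm the new columns are present.
df = pd.read_csv("./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv")

expected = {"Agent", "Model", "Organization", "Source", "Easy", "Medium", "Hard",
            "Average SR", "Date", "Verified", "Note", "Release Time"}
missing = expected - set(df.columns)
assert not missing, f"missing columns: {missing}"

# Verified/unverified split and release dates feed the new SR-over-time plot.
print(df[["Agent", "Average SR", "Verified", "Release Time"]])
```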
content.py CHANGED
@@ -9,14 +9,22 @@ LINKS = """
"""

INTRODUCTION_TEXT = """
- Online-Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
- Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1-5 steps), Medium (6-10 steps), and Hard (11+ steps).
+ Online-Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains, with reliable LLM-as-a-Judge (WebJudge) automatic evaluation.
+ Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1-5 steps), Medium (6-10 steps), and Hard (11+ steps).
"""

LEADERBOARD_TEXT = """
- ### Leaderboard
+ ## Leaderboard
Our goal is to conduct a rigorous assessment of the current state of web agents. We maintain two leaderboards—one for automatic evaluation and another for human evaluation.
- Please click "Submission Guideline" for details.
+
+ When using our benchmark or submitting results, please first carefully review the important notes below to ensure proper usage and reliable evaluation results, and follow the "Submission Guideline".
+
+ ### ⚠ Important Notes for Reliable Evaluation:
+ - **Start from the specified websites, not Google Search**: To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
+ - **Include only factual actions, not agent outputs**: The action history should contain only the factual actions taken by the agent to complete the task (e.g., clicking elements and typing text). Do not include the final response or any other agent outputs, as they may contain hallucinated content and result in a high rate of false positives.
+ - **Use o4-mini for WebJudge**: WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+
+ **Please do not use it as training data for your agent.**
"""

SUBMISSION_TEXT = """
@@ -29,8 +37,8 @@ Submissions must include a sequence of images (i.e., screenshots in the trajectory)
EVALUATION_DETAILS = """
In certain scenarios, testing on the full Online-Mind2Web dataset may not be feasible due to cost, privacy, or legal constraints. To facilitate fair and apples-to-apples comparisons, we release both our human evaluation labels and auto-eval details.

- - **Human Evaluation**: Task-level human evaluation labels are provided in the [file](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/human_label.json).
- - **Auto-Evaluation**: The results of WebJudge are available in the [folder](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results)."""
+ - **Human Evaluation**: Task-level human evaluation labels are provided in the [file](https://github.com/OSU-NLP-Group/Online-Mind2Web/blob/main/data/evaluation_results/online_mind2web_evaluation_results/human_label.json).
+ - **Auto-Evaluation**: The results of WebJudge are available in the [folder](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/online_mind2web_evaluation_results/webjudge_o4-mini)."""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Note: Online-Mind2Web is derived from the original Mind2Web dataset. We kindly ask that you cite both the original and this work when using or referencing the data."
CITATION_BUTTON_TEXT = r"""
@@ -57,14 +65,14 @@ CITATION_BUTTON_TEXT = r"""
"""

SUBMIT_INTRODUCTION = """
- You should use the script provided in our GitHub repository to obtain automatic evaluation results on your own and submit them along with all trajectories.
- To ensure the authenticity and reliability of the reported results, we will also conduct a verification of auto-eval results.
+ You should use the script provided in our GitHub [repository](https://github.com/OSU-NLP-Group/Online-Mind2Web) to obtain automatic evaluation results on your own and submit them along with all trajectories to enhance transparency.
+ To ensure the authenticity and reliability of the reported results, we will also verify the auto-eval results.
If you have conducted your own human evaluation, please also attach your human-eval results. We will spot-check these before adding them to the human-eval table.

- ## Important Notes for Reliable Evaluation:
- - To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
- - The action history should contain only the actions taken by the agent to complete the task (e.g., clicking elements and typing text). Please avoid including the final response, as it may contain hallucinated content, leading to a high rate of false positives.
- - WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+ ## Important Notes for Reliable Evaluation:
+ - **Start from the specified websites, not Google Search**: To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
+ - **Include only factual actions, not agent outputs**: The action history should contain only the factual actions taken by the agent to complete the task (e.g., clicking elements and typing text). Do not include the final response or any other agent outputs, as they may contain hallucinated content and result in a high rate of false positives.
+ - **Use o4-mini for WebJudge**: WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.

## ⚠ Please submit the trajectory file with the following format:
The result of each task is stored in a folder named after its `task_id`, containing:
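The "include only factual actions" rule above is easy to violate when exporting trajectories. Below is a purely illustrative sketch of that filtering; the step fields (`action_type`, `final_response`, and so on) are hypothetical and do not reflect the repository's actual trajectory schema:

```python
def factual_actions_only(steps: list[dict]) -> list[dict]:
    """Keep concrete interactions (click/type/...); drop model-generated text such as a
    final response, which the notes above warn can introduce hallucinated content."""
    allowed = {"click", "type", "select", "scroll"}
    return [s for s in steps if s.get("action_type") in allowed]

# Hypothetical trajectory: the last entry is the agent's own summary and must not
# be passed to WebJudge as part of the action history.
trajectory = [
    {"action_type": "click", "target": "search button"},
    {"action_type": "type", "target": "search box", "value": "wireless mouse"},
    {"action_type": "final_response", "value": "I have completed the task."},
]
print(factual_actions_only(trajectory))  # the final_response entry is filtered out
```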
requirements.txt CHANGED
@@ -2,5 +2,4 @@ datasets
gradio
huggingface-hub
numpy
- APScheduler
- plotly
+ APScheduler