Commit 49268fc
Parent(s): 1f3e8c7
update 0602

Changed files:
- app.py +131 -19
- auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv +10 -7
- content.py +20 -12
- requirements.txt +1 -2
app.py
CHANGED
@@ -6,6 +6,7 @@ import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 from collections import Counter
 from apscheduler.schedulers.background import BackgroundScheduler
+import numpy as np
 
 from scorer import question_scorer
 from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
@@ -16,22 +17,125 @@ OWNER = "Online-Mind2Web"
 YEAR_VERSION = "2025"
 LOCAL_DEBUG = True
 
+def _format_sr_column(series: pd.Series) -> pd.Series:
+    numeric = pd.to_numeric(series, errors="coerce")
+    out = numeric.map(lambda x: f"{x:.1f}" if pd.notna(x) else "")
+
+    # Wherever coercion failed (original was str / NaN), restore original value
+    mask = numeric.isna() & series.notna()
+    out[mask] = series[mask]
+    return out
+
 def get_dataframe_from_results(eval_path):
     df = pd.read_csv(eval_path)
-
-
-    df
+
+    if "Verified" not in df.columns:
+        df = df.sort_values(by=["Average SR"], ascending=False)
+    else:
+        df = df.sort_values(
+            by=["Verified", "Average SR"],
+            ascending=[False, False],  # descending; rows with Verified=True sort to the top
+            kind="mergesort"           # stable sort keeps the order predictable
+        )
+
+    for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
+        if col in df.columns:
+            df[col] = _format_sr_column(df[col])
+
     return df
 
-# auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
 auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
 human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
-TYPES = ["str", "str", "str", "
+TYPES = ["str", "str", "str", "markdown", "number", "number", "number", "number", "str", "str", "markdown", "str"]
 
-def
-
-
-
+def df_to_gradio(df: pd.DataFrame) -> dict:
+    display_df = df.drop(columns=["Release Time"], errors="ignore")
+    headers = display_df.columns.tolist()
+    data = display_df.values.tolist()
+    # simple styling: red if not verified
+    if "Verified" in display_df.columns:
+        verified_idx = headers.index("Verified")
+        styling = [["background-color:#ffcccc" if not row[verified_idx] else "" for _ in headers] for row in data]
+    else:
+        styling = [["" for _ in headers] for _ in data]
+    return {"data": data, "headers": headers, "metadata": {"styling": styling}}
+
+def gradio_plot_wrapper(json_file):
+    return plot_heatmap_with_performance_bar(json_file.name)
+
+def style_auto_df(df: pd.DataFrame):
+    def _row_style(row):
+        bg = "background-color: #ffcccc" if row["Verified"] != True else ""
+        return [bg] * len(row)
+
+    styler = df.style.apply(_row_style, axis=1)
+    try:
+        styler = styler.hide(axis="index")
+    except Exception:
+        pass
+    return styler
+
+def nice_bounds(low: float, high: float) -> tuple[float, float]:
+    if low == high:
+        low -= 1; high += 1
+    return (np.floor(low / 5) * 5, np.ceil(high / 5) * 5)
+
+def plot_sr_vs_time(df: pd.DataFrame, title: str = "Success rate over time") -> go.Figure:
+
+    work = df[df["Verified"] == True].copy()  # filter out unverified rows
+
+    work["Release Time"] = pd.to_datetime(work["Release Time"], errors="coerce")
+    work["Average SR"] = pd.to_numeric(work["Average SR"], errors="coerce")
+    work = work.dropna(subset=["Release Time", "Average SR"])
+
+    agents = work["Agent"].unique().tolist()
+    color_map = {a: f"hsl({int(360*i/len(agents))},70%,45%)" for i, a in enumerate(agents)}
+
+    fig = go.Figure()
+
+    y_min_raw, y_max_raw = work["Average SR"].min(), work["Average SR"].max()
+    y_min, y_max = nice_bounds(y_min_raw, y_max_raw)
+    band_edges = np.linspace(y_min, y_max, 4)
+    band_cols = ["rgba(226,247,226,0.35)", "rgba(255,255,204,0.35)", "rgba(255,228,225,0.35)"]
+    shapes = [
+        dict(type="rect", xref="paper", yref="y", x0=0, x1=1, y0=band_edges[i], y1=band_edges[i+1],
+             fillcolor=band_cols[i], line_width=0)
+        for i in range(3)
+    ]
+
+    for _, row in work.iterrows():
+        fig.add_trace(
+            go.Scatter(
+                x=[row["Release Time"]],
+                y=[row["Average SR"]],
+                mode="markers+text",
+                text=[row["Agent"]],
+                textposition="top center",
+                textfont=dict(size=11),
+                marker=dict(size=10, color=color_map[row["Agent"]], opacity=0.9),
+                hovertemplate="Agent: %{text}<br>SR: %{y:.1f}%<br>Date: %{x|%Y-%m}<extra></extra>",
+                showlegend=False,
+            )
+        )
+
+    if len(work) >= 2:
+        x_numeric = work["Release Time"].map(pd.Timestamp.toordinal)
+        slope, intercept = np.polyfit(x_numeric, work["Average SR"], 1)
+        x_range = pd.date_range(work["Release Time"].min(), work["Release Time"].max(), freq="MS")
+        y_pred = slope * x_range.map(pd.Timestamp.toordinal) + intercept
+        fig.add_trace(go.Scatter(x=x_range, y=y_pred, mode="lines", line=dict(color="rgba(0,0,0,0.6)", dash="dash"), name="Trend", hoverinfo="skip"))
+
+    fig.update_layout(
+        title=dict(text=title, x=0.5, xanchor="center", font=dict(size=22)),
+        xaxis_title="Release Time",
+        yaxis_title="Success Rate",
+        template="plotly_white",
+        width=1800, height=800,
+        shapes=shapes,
+    )
+    fig.update_xaxes(dtick="M1", tickformat="%Y-%m", showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
+    fig.update_yaxes(showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
+    return fig
 
 def plot_heatmap_with_performance_bar(json_file):
     with open(json_file, "r") as f:
@@ -143,8 +247,14 @@ def plot_heatmap_with_performance_bar(json_file):
     )
     return fig
 
-def
-
+def refresh():
+    auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
+    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
+    sr_time_plot = plot_sr_vs_time(auto_eval_dataframe_test)
+    auto_eval_dataframe_test = gr.Dataframe(value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES, interactive=False, wrap=False, elem_id="auto-leaderboard-table")
+
+    return auto_eval_dataframe_test, human_eval_dataframe_test, sr_time_plot
+
 
 demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")
 
@@ -178,12 +288,10 @@ with demo:
         gr.Markdown(EVALUATION_DETAILS)
 
     with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
-
-
-
-
-            wrap=False
-        )
+        sr_time_plot = gr.Plot(plot_sr_vs_time(auto_eval_dataframe_test))
+        gr.Markdown('### Agents highlighted in red represent unverified results that may involve unreliable evaluations and are provided for reference only. You can refer to the "Note" column for more details.')
+        auto_leaderboard_table_test = gr.Dataframe(value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES, interactive=False, wrap=False, elem_id="auto-leaderboard-table")
+
 
     with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
         with gr.Row():
@@ -193,7 +301,11 @@ with demo:
     refresh_button.click(
         refresh,
         inputs=[],
-        outputs=[
+        outputs=[
+            auto_leaderboard_table_test,
+            human_leaderboard_table_test,
+            sr_time_plot
+        ],
     )
 
 
@@ -201,4 +313,4 @@ scheduler = BackgroundScheduler()
 scheduler.start()
 
 if __name__ == "__main__":
-    demo.launch(debug=True)
+    demo.launch(debug=True,share=True)
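For context, the new ordering and score formatting in `get_dataframe_from_results` behave as in the minimal sketch below, run on hypothetical toy rows (agent names and numbers are made up; the real data comes from the CSV files in this commit): verified entries are listed first, then sorted by descending Average SR, and scores are rendered to one decimal place.

import pandas as pd

# Hypothetical toy leaderboard rows (not from the real CSV).
toy = pd.DataFrame({
    "Agent": ["A", "B", "C"],
    "Average SR": [58.3, 78.0, 30.0],
    "Verified": [True, False, True],
})

# Same sort as the commit: Verified rows first, then descending Average SR (stable).
toy = toy.sort_values(by=["Verified", "Average SR"], ascending=[False, False], kind="mergesort")

# Same rendering as _format_sr_column: one decimal for numeric values.
toy["Average SR"] = pd.to_numeric(toy["Average SR"], errors="coerce").map(
    lambda x: f"{x:.1f}" if pd.notna(x) else ""
)

print(toy.to_string(index=False))
# Expected order: A (verified, 58.3), C (verified, 30.0), B (unverified, 78.0)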
auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv
CHANGED
@@ -1,7 +1,10 @@
-Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
-Operator,OpenAI Computer-Using Agent,OpenAI,OSU NLP,73.5,59.4,39.2,58.3,2025-5-11
-SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,51.8,28,9.5,30,2025-5-11
-Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,44.6,23.1,10.8,26,2025-5-11
-Claude Computer Use 3.5,Claude-3-5-sonnet-20241022,Anthropic,OSU NLP,51.8,16.1,8.1,24,2025-5-11
-Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,51.8,23.1,6.8,27,2025-5-11
-Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,75.9,41.3,27,47.3,2025-5-11
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date,Verified,Note,Release Time
+Operator,OpenAI Computer-Using Agent,OpenAI,[OSU NLP](https://arxiv.org/abs/2504.01382),73.5,59.4,39.2,58.3,2025-5-11,True,,2025-01
+SeeAct,gpt-4o-2024-08-06,OSU,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,28,9.5,30,2025-5-11,True,,2024-01
+Browser Use,gpt-4o-2024-08-06,Browser Use,[OSU NLP](https://arxiv.org/abs/2504.01382),44.6,23.1,10.8,26,2025-5-11,True,,2025-01
+Claude Computer Use 3.5,Claude-3-5-sonnet-20241022,Anthropic,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,16.1,8.1,24,2025-5-11,True,,2024-10
+Agent-E,gpt-4o-2024-08-06,Emergence AI,[OSU NLP](https://arxiv.org/abs/2504.01382),51.8,23.1,6.8,27,2025-5-11,True,,2024-07
+Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,[OSU NLP](https://arxiv.org/abs/2504.01382),75.9,41.3,27,47.3,2025-5-11,True,,2025-02
+Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
+Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
+Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05
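The rows above add three columns that the updated app.py relies on: `Verified` (drives the sorting and the red highlighting), `Note`, and `Release Time` (dropped from the table view but used by `plot_sr_vs_time`). A quick, hedged sanity check, assuming the relative path used in app.py:

import pandas as pd

df = pd.read_csv("./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv")
print(df.columns.tolist())        # ..., 'Verified', 'Note', 'Release Time'
print(df["Verified"].dtype)       # pandas should parse True/False as bool
print(df["Release Time"].head())  # 'YYYY-MM' strings, later parsed with pd.to_datetime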
content.py
CHANGED
@@ -9,14 +9,22 @@ LINKS = """
 """
 
 INTRODUCTION_TEXT = """
-Online-Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
-Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1
+Online-Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains, with reliable LLM-as-a-Judge (WebJudge) automatic evaluation.
+Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1-5 steps), Medium (6-10 steps), and Hard (11+ steps).
 """
 
 LEADERBOARD_TEXT = """
-
+## Leaderboard
 Our goal is to conduct a rigorous assessment of the current state of web agents. We maintain two leaderboards—one for automatic evaluation and another for human evaluation.
-
+
+When using our benchmark or submitting results, please first carefully review the important notes below to ensure proper usage and reliable evaluation results, and follow the "Submission Guideline".
+
+### ⚠ Important Notes for Reliable Evaluation:
+- **Start from the specified websites, not Google Search**: To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
+- **Include only factual actions, not agent outputs**: The action history should contain only the factual actions taken by the agent to complete the task (e.g., clicking elements and typing text). Do not include the final response or any of the agent's other outputs, as they may contain hallucinated content and result in a high rate of false positives.
+- **Use o4-mini for WebJudge**: WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+
+**Please do not use Online-Mind2Web as training data for your agent.**
 """
 
 SUBMISSION_TEXT = """
@@ -29,8 +37,8 @@ Submissions must include a sequence of images (i.e., screenshots in the trajectory)
 EVALUATION_DETAILS = """
 In certain scenarios, testing on the full Online-Mind2Web dataset may not be feasible due to cost, privacy, or legal constraints. To facilitate fair and apples-to-apples comparisons, we release both our human evaluation labels and auto-eval details.
 
-- **Human Evaluation**: Task-level human evaluation labels are provided in the [file](https://github.com/OSU-NLP-Group/Online-Mind2Web/
-- **Auto-Evaluation**: The results of WebJudge are available in the [folder](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results)."""
+- **Human Evaluation**: Task-level human evaluation labels are provided in the [file](https://github.com/OSU-NLP-Group/Online-Mind2Web/blob/main/data/evaluation_results/online_mind2web_evaluation_results/human_label.json).
+- **Auto-Evaluation**: The results of WebJudge are available in the [folder](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/evaluation_results/online_mind2web_evaluation_results/webjudge_o4-mini)."""
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Note: Online-Mind2Web is derived from the original Mind2Web dataset. We kindly ask that you cite both the original and this work when using or referencing the data."
 CITATION_BUTTON_TEXT = r"""
@@ -57,14 +65,14 @@ CITATION_BUTTON_TEXT = r"""
 """
 
 SUBMIT_INTRODUCTION = """
-You should use the script provided in our GitHub repository to obtain automatic evaluation results on your own and submit them along with all trajectories.
-To ensure the authenticity and reliability of the reported results, we will also
+You should use the script provided in our GitHub [repository](https://github.com/OSU-NLP-Group/Online-Mind2Web) to obtain automatic evaluation results on your own and submit them along with all trajectories to enhance transparency.
+To ensure the authenticity and reliability of the reported results, we will also verify the auto-eval results.
 If you have conducted your own human evaluation, please also attach your human-eval results. We will spot-check these before adding them to the human-eval table.
 
-## Important Notes for Reliable Evaluation:
-- To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
-- The action history should contain only the actions taken by the agent to complete the task (e.g.,
-- WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+## ⚠ Important Notes for Reliable Evaluation:
+- **Start from the specified websites, not Google Search**: To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
+- **Include only factual actions, not agent outputs**: The action history should contain only the factual actions taken by the agent to complete the task (e.g., clicking elements and typing text). Do not include the final response or any of the agent's other outputs, as they may contain hallucinated content and result in a high rate of false positives.
+- **Use o4-mini for WebJudge**: WebJudge powered by o4-mini demonstrates a higher alignment with human judgment, achieving an average agreement rate of 85.7% and maintaining a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
 
 ## ⚠ Please submit the trajectory file with the following format:
 The result of each task is stored in a folder named as its `task_id`, containing:
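The difficulty split stated in INTRODUCTION_TEXT maps each task's reference step count to a level; a hypothetical helper (not part of the repo) encoding the same thresholds:

def difficulty_level(reference_steps: int) -> str:
    # Easy: 1-5 steps, Medium: 6-10 steps, Hard: 11+ steps (per INTRODUCTION_TEXT).
    if reference_steps <= 5:
        return "Easy"
    if reference_steps <= 10:
        return "Medium"
    return "Hard"

assert difficulty_level(3) == "Easy"
assert difficulty_level(8) == "Medium"
assert difficulty_level(14) == "Hard"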
requirements.txt
CHANGED
@@ -2,5 +2,4 @@ datasets
 gradio
 huggingface-hub
 numpy
-APScheduler
-plotly
+APScheduler