Spaces:

osunlp
/

Online_Mind2Web_Leaderboard

Running

App Files Files Community

Online_Mind2Web_Leaderboard / app.py

WeijianQi1999

remove some comments

86aa3b5 3 days ago

raw

history blame contribute delete

12.2 kB

	import os
	import gradio as gr
	import pandas as pd
	import json
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	from collections import Counter
	from apscheduler.schedulers.background import BackgroundScheduler
	import numpy as np

	from scorer import question_scorer
	from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION

	# Access token read from the TOKEN environment variable (None when it is unset).
	TOKEN = os.getenv("TOKEN")

	# Static configuration values for this app.
	OWNER = "Online-Mind2Web"
	YEAR_VERSION = "2025"
	LOCAL_DEBUG = True

	def _format_sr_column(series: pd.Series) -> pd.Series:
	numeric = pd.to_numeric(series, errors="coerce")
	out = numeric.map(lambda x: f"{x:.1f}" if pd.notna(x) else "")

	# Wherever coercion failed (original was str / NaN), restore original value
	mask = numeric.isna() & series.notna()
	out[mask] = series[mask]
	return out

	def get_dataframe_from_results(eval_path):
	    """Load a leaderboard CSV, rank its rows and format the success-rate columns.

	    Args:
	        eval_path: Path of the CSV file handed to ``pd.read_csv``.

	    Returns:
	        The loaded frame sorted by "Average SR" in descending order. When a
	        "Verified" column exists it is the primary (descending) sort key.
	        Whichever of the 'Easy', 'Medium', 'Hard' and 'Average SR' columns are
	        present are converted to display strings by ``_format_sr_column``.
	    """
	    results = pd.read_csv(eval_path)

	    if "Verified" in results.columns:
	        results = results.sort_values(
	            by=["Verified", "Average SR"],
	            ascending=[False, False],
	            kind="mergesort",
	        )
	    else:
	        results = results.sort_values(by=["Average SR"], ascending=False)

	    # Formatting happens after sorting so the ordering uses the raw values.
	    present_sr_columns = [
	        name for name in ('Easy', 'Medium', 'Hard', 'Average SR') if name in results.columns
	    ]
	    for name in present_sr_columns:
	        results[name] = _format_sr_column(results[name])

	    return results

	# Both leaderboard tables are loaded once, at import time, from CSV files
	# addressed relative to the current working directory.
	auto_eval_dataframe_test = get_dataframe_from_results(
	    './auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv'
	)
	human_eval_dataframe_test = get_dataframe_from_results(
	    './human_Mind2Web-Online - Leaderboard_data.csv'
	)

	# Per-column datatypes passed as ``datatype=`` to the gr.Dataframe components.
	TYPES = [
	    "str", "str", "str",
	    "markdown",
	    "number", "number", "number", "number",
	    "str", "str",
	    "markdown",
	    "str",
	]

	def df_to_gradio(df: pd.DataFrame) -> dict:
	    """Convert a leaderboard frame into a value dict for ``gr.Dataframe``.

	    The "Release Time" column is dropped when present. If a "Verified" column
	    exists, every cell of a row whose "Verified" value is not ``True`` gets a
	    light-red background; otherwise no cell is styled.

	    Args:
	        df: Leaderboard data.

	    Returns:
	        A dict with ``"data"`` (list of row lists), ``"headers"`` (column
	        names) and ``"metadata"`` holding a ``"styling"`` grid of CSS strings
	        with the same shape as ``"data"``.
	    """
	    display_df = df.drop(columns=["Release Time"], errors="ignore")
	    headers = display_df.columns.tolist()
	    data = display_df.values.tolist()

	    unverified_css = "background-color:#ffcccc"
	    if "Verified" in headers:
	        verified_idx = headers.index("Verified")
	        # Compare against True instead of relying on truthiness: NaN (a blank
	        # CSV cell) and non-boolean values are truthy, and would otherwise be
	        # shown as verified. style_auto_df and plot_sr_vs_time apply the same
	        # "only an explicit True is verified" rule.
	        row_css = [
	            unverified_css if row[verified_idx] != True else ""  # noqa: E712
	            for row in data
	        ]
	    else:
	        row_css = [""] * len(data)

	    styling = [[css] * len(headers) for css in row_css]
	    return {"data": data, "headers": headers, "metadata": {"styling": styling}}

	def gradio_plot_wrapper(json_file):
	    """Build the task heatmap for a file object by forwarding its ``name``.

	    Args:
	        json_file: Object whose ``name`` attribute is passed on as the path
	            argument of ``plot_heatmap_with_performance_bar``.

	    Returns:
	        Whatever ``plot_heatmap_with_performance_bar`` returns for that path.
	    """
	    source_path = json_file.name
	    return plot_heatmap_with_performance_bar(source_path)

	def style_auto_df(df: pd.DataFrame):
	def _row_style(row):
	bg = "background-color: #ffcccc" if row["Verified"] != True else ""
	return [bg] * len(row)

	styler = df.style.apply(_row_style, axis=1)
	try:
	styler = styler.hide(axis="index")
	except Exception:
	pass
	return styler

	def nice_bounds(low: float, high: float) -> tuple[float, float]:
	    """Expand ``[low, high]`` outward to the nearest multiples of 5.

	    When both ends are equal the interval is first widened by 1 on each side
	    so the result always spans a non-empty range.

	    Returns:
	        ``(lower, upper)`` where ``lower`` is ``low`` rounded down and
	        ``upper`` is ``high`` rounded up to a multiple of 5.
	    """
	    step = 5
	    if low == high:
	        low, high = low - 1, high + 1
	    lower = np.floor(low / step) * step
	    upper = np.ceil(high / step) * step
	    return (lower, upper)

	def plot_sr_vs_time(df: pd.DataFrame, title: str = "Success rate over time") -> go.Figure:
	    """Scatter each verified agent's average success rate against its release date.

	    Rows are kept only when "Verified" is ``True`` (all rows are kept if the
	    frame has no "Verified" column) and when both "Release Time" and
	    "Average SR" can be parsed. Each remaining row becomes one labelled
	    marker. Three equally tall shaded bands cover the y-range, and a dashed
	    linear trend line is added when at least two distinct release dates exist.

	    Args:
	        df: Leaderboard data with "Agent", "Release Time" and "Average SR"
	            columns, and optionally "Verified".
	        title: Figure title.

	    Returns:
	        The assembled plotly figure. With no plottable rows the figure has
	        the layout only (no markers, bands or trend line).
	    """
	    # get_dataframe_from_results treats "Verified" as an optional column, so
	    # do the same here instead of raising KeyError when it is absent.
	    if "Verified" in df.columns:
	        work = df[df["Verified"] == True].copy()  # noqa: E712 - drop unverified rows
	    else:
	        work = df.copy()

	    work["Release Time"] = pd.to_datetime(work["Release Time"], errors="coerce")
	    work["Average SR"] = pd.to_numeric(work["Average SR"], errors="coerce")
	    work = work.dropna(subset=["Release Time", "Average SR"])

	    fig = go.Figure()
	    shapes = []

	    if not work.empty:
	        # One evenly spaced hue per distinct agent.
	        agents = work["Agent"].unique().tolist()
	        color_map = {a: f"hsl({int(360*i/len(agents))},70%,45%)" for i, a in enumerate(agents)}

	        # Background bands: split the rounded y-range into three equal slices.
	        # Skipped for empty data, where min()/max() would be NaN.
	        y_min, y_max = nice_bounds(work["Average SR"].min(), work["Average SR"].max())
	        band_edges = np.linspace(y_min, y_max, 4)
	        band_cols = ["rgba(226,247,226,0.35)", "rgba(255,255,204,0.35)", "rgba(255,228,225,0.35)"]
	        shapes = [
	            dict(type="rect", xref="paper", yref="y", x0=0, x1=1,
	                 y0=band_edges[i], y1=band_edges[i + 1],
	                 fillcolor=band_cols[i], line_width=0)
	            for i in range(3)
	        ]

	        # One trace per row so every point carries its own colour and label.
	        for _, row in work.iterrows():
	            fig.add_trace(
	                go.Scatter(
	                    x=[row["Release Time"]],
	                    y=[row["Average SR"]],
	                    mode="markers+text",
	                    text=[row["Agent"]],
	                    textposition="top center",
	                    textfont=dict(size=11),
	                    marker=dict(size=10, color=color_map[row["Agent"]], opacity=0.9),
	                    # "%{x|%Y-%m}" is plotly's date-format syntax; the pipe must
	                    # not be backslash-escaped or the format is not applied.
	                    hovertemplate="Agent: %{text}<br>SR: %{y:.1f}%<br>Date: %{x|%Y-%m}<extra></extra>",
	                    showlegend=False,
	                )
	            )

	    # A straight-line fit needs two distinct x values; with a single date the
	    # fit is rank-deficient and the resulting line would be meaningless.
	    if len(work) >= 2 and work["Release Time"].nunique() >= 2:
	        x_numeric = work["Release Time"].map(pd.Timestamp.toordinal)
	        slope, intercept = np.polyfit(x_numeric, work["Average SR"], 1)
	        x_range = pd.date_range(work["Release Time"].min(), work["Release Time"].max(), freq="MS")
	        y_pred = slope * x_range.map(pd.Timestamp.toordinal) + intercept
	        fig.add_trace(
	            go.Scatter(
	                x=x_range,
	                y=y_pred,
	                mode="lines",
	                line=dict(color="rgba(0,0,0,0.6)", dash="dash"),
	                name="Trend",
	                hoverinfo="skip",
	            )
	        )

	    fig.update_layout(
	        title=dict(text=title, x=0.5, xanchor="center", font=dict(size=22)),
	        xaxis_title="Release Time",
	        yaxis_title="Success Rate",
	        template="plotly_white",
	        width=1800, height=800,
	        shapes=shapes,
	    )
	    fig.update_xaxes(dtick="M1", tickformat="%Y-%m", showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
	    fig.update_yaxes(showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
	    return fig

	def _parse_success_flag(raw_val) -> int:
	    """Return 1 when ``raw_val`` converts to the integer 1, otherwise 0."""
	    try:
	        return 1 if int(raw_val) == 1 else 0
	    except (TypeError, ValueError):
	        # TypeError covers JSON null (None); ValueError covers non-numeric text.
	        return 0


	def plot_heatmap_with_performance_bar(json_file):
	    """Plot a per-task success heatmap with an aggregate bar chart underneath.

	    The JSON file must hold a non-empty list of task dicts. Each task has a
	    "task_id" key, and the agents are the keys of the first task that end in
	    "_human_label". A label counts as success only when it converts to the
	    integer 1.

	    Args:
	        json_file: Path of the JSON file to read.

	    Returns:
	        A two-row plotly figure: the agent-by-task heatmap (agents ordered by
	        success rate, best first) and a horizontal bar chart with the number
	        of tasks solved by any agent and by the best single agent.

	    Raises:
	        ValueError: If the file does not contain a non-empty list, or the
	            first task has no "*_human_label" key.
	    """
	    with open(json_file, "r", encoding="utf-8") as f:
	        data = json.load(f)

	    if not isinstance(data, list) or not data:
	        raise ValueError(f"{json_file}: expected a non-empty JSON list of task records")

	    agents = [k for k in data[0].keys() if k.endswith("_human_label")]
	    if not agents:
	        raise ValueError(f"{json_file}: first task has no '*_human_label' keys")

	    original_ids = [task["task_id"] for task in data]

	    # Long-format records: one row per (task, agent) pair.
	    records = []
	    for task in data:
	        task_id = task["task_id"]
	        for agent in agents:
	            records.append({
	                "Task ID": task_id,
	                "Agent": agent.replace("_human_label", ""),
	                "Success": _parse_success_flag(task.get(agent, "0")),
	            })

	    df = pd.DataFrame(records)
	    pivot = df.pivot_table(index="Agent", columns="Task ID", values="Success", aggfunc="max")

	    # Restore the file's task order (pivot_table sorts the columns).
	    for task_id in original_ids:
	        if task_id not in pivot.columns:
	            pivot[task_id] = 0
	    pivot = pivot[original_ids]

	    # Order agents by success rate, best first; the helper column is temporary.
	    agent_success_rate = pivot.sum(axis=1) / pivot.shape[1]
	    pivot["SuccessRate"] = agent_success_rate
	    pivot = pivot.sort_values(by="SuccessRate", ascending=False)
	    pivot = pivot.drop(columns=["SuccessRate"])

	    agent_name_map = {
	        "Operator": "Operator",
	        "Agent-E": "Agent-E",
	        "Browser_Use": "Browser Use",
	        "Claude_Computer_Use": "Claude Computer Use",
	        "SeeAct": "SeeAct",
	    }
	    # Row labels: display name plus success percentage. agent_success_rate is
	    # still indexed by the raw agent keys, so look it up before relabelling.
	    sorted_agents = pivot.index.tolist()
	    pivot.index = [
	        f"{agent_name_map.get(agent, agent)} ({agent_success_rate[agent]*100:.1f}%)"
	        for agent in sorted_agents
	    ]

	    custom_labels = [["Success" if val == 1 else "Failure" for val in row] for row in pivot.values]
	    any_agent_solved = pivot.max(axis=0).sum()
	    best_agent_solved = pivot.sum(axis=1).max()
	    total_tasks = len(original_ids)

	    fig = make_subplots(
	        rows=2, cols=1,
	        row_heights=[0.8, 0.2],
	        vertical_spacing=0.08,
	        subplot_titles=("TASK ID", ""),
	        shared_xaxes=False,
	    )

	    fig.add_trace(go.Heatmap(
	        z=pivot.values,
	        x=pivot.columns,
	        y=pivot.index,
	        colorscale=[[0, "white"], [1, "skyblue"]],
	        zmin=0,
	        zmax=1,
	        showscale=False,
	        customdata=custom_labels,
	        hovertemplate="Agent: %{y}<br>Task ID: %{x}<br>Completion: %{customdata}<extra></extra>",
	    ), row=1, col=1)

	    fig.add_trace(go.Bar(
	        y=["Any agent", "Best agent"],
	        x=[any_agent_solved, best_agent_solved],
	        orientation='h',
	        marker_color=["dodgerblue", "mediumseagreen"],
	        text=[
	            f"{int(any_agent_solved)}/{total_tasks} ({any_agent_solved / total_tasks:.1%})",
	            f"{int(best_agent_solved)}/{total_tasks} ({best_agent_solved / total_tasks:.1%})",
	        ],
	        textposition="auto",
	        showlegend=False,
	    ), row=2, col=1)

	    # Point-less scatter traces whose only purpose is a Success/Failure legend.
	    fig.add_trace(go.Scatter(
	        x=[None], y=[None],
	        mode='markers',
	        marker=dict(size=10, color='skyblue'),
	        name='Success',
	    ))
	    fig.add_trace(go.Scatter(
	        x=[None], y=[None],
	        mode='markers',
	        marker=dict(size=10, color='white', line=dict(width=1, color='black')),
	        name='Failure',
	    ))

	    fig.update_xaxes(range=[0, total_tasks], row=2, col=1)
	    fig.update_layout(
	        height=600,
	        xaxis=dict(showticklabels=False),
	        yaxis=dict(title="Agent"),
	        yaxis2=dict(title=""),
	        margin=dict(t=60),
	    )
	    return fig

	def refresh():
	    """Reload both leaderboard CSVs and rebuild the refreshable outputs.

	    Returns:
	        A 3-tuple: a new ``gr.Dataframe`` for the auto-evaluation table, the
	        reloaded human-evaluation frame, and the success-rate-over-time figure
	        built from the auto-evaluation data.
	    """
	    auto_df = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
	    human_df = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')

	    # The plot is built from the frame itself, before it is wrapped in a component.
	    trend_figure = plot_sr_vs_time(auto_df)
	    auto_table = gr.Dataframe(
	        value=df_to_gradio(auto_df),
	        datatype=TYPES,
	        interactive=False,
	        wrap=False,
	        elem_id="auto-leaderboard-table",
	    )

	    return auto_table, human_df, trend_figure


	# Top-level Gradio app. The inline CSS targets the element id "human-leaderboard-table".
	# NOTE(review): no component created in this block sets that elem_id — confirm the selector is still needed.
	demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")

	# Page layout: title/links/introduction, a collapsed citation box, the leaderboard text,
	# three tabs (human evaluation, auto evaluation, submission guideline) and a Refresh button.
	with demo:
	gr.HTML(TITLE)
	gr.HTML(LINKS)
	gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

	with gr.Row():
	with gr.Accordion("📙 Citation", open=False):
	citation_button = gr.Textbox(
	value=CITATION_BUTTON_TEXT,
	label=CITATION_BUTTON_LABEL,
	elem_id="citation-button",
	lines=10,
	)

	gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

	# Human-evaluation tab: the table loaded at import time, then the per-task heatmap
	# built once from ./human_label.json, then the evaluation details text.
	with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
	human_leaderboard_table_test = gr.Dataframe(
	value=human_eval_dataframe_test,
	datatype=TYPES,
	interactive=False,
	wrap=False
	)
	gr.Markdown("### Visualization")
	gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
	fig = plot_heatmap_with_performance_bar("./human_label.json")
	gr.Plot(fig)
	gr.Markdown(EVALUATION_DETAILS)

	# Auto-evaluation tab: success-rate-over-time plot plus the table, whose value comes
	# from df_to_gradio (which supplies the red background for unverified rows).
	with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
	sr_time_plot = gr.Plot(plot_sr_vs_time(auto_eval_dataframe_test))
	gr.Markdown('### Agents highlighted in red represent unverified results that may involve unreliable evaluations and are provided for reference only. You can refer to the "Note" column for more details.')
	auto_leaderboard_table_test = gr.Dataframe(value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES, interactive=False, wrap=False, elem_id="auto-leaderboard-table")


	# Submission tab: static instructions only.
	with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
	with gr.Row():
	gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

	# Clicking Refresh calls refresh() with no inputs; its three return values are written,
	# in order, to the auto table, the human table and the success-rate plot.
	refresh_button = gr.Button("Refresh")
	refresh_button.click(
	refresh,
	inputs=[],
	outputs=[
	auto_leaderboard_table_test,
	human_leaderboard_table_test,
	sr_time_plot
	],
	)


	# A background scheduler is created and started at import time.
	# NOTE(review): no job is added to it in the code visible here — confirm whether it is still needed.
	scheduler = BackgroundScheduler()
	scheduler.start()

	# Launch the app only when the file is executed as a script, with debug and share enabled.
	if __name__ == "__main__":
	demo.launch(debug=True,share=True)