jbragg committed on
Commit 5ef407f · unverified · 1 Parent(s): f82c430

Leaderboard (#2)

Files changed (7)
  1. .gitignore +181 -0
  2. Dockerfile +44 -0
  3. README.md +15 -0
  4. app.py +644 -0
  5. content.py +55 -0
  6. requirements-dev.txt +2 -0
  7. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,181 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Vim files
+ *.swp
+ *.swo
+ *.un~
+
+ # Misc
+ .DS_Store
+ .mise.toml
+ .vscode/
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ FROM python:3.10-slim
+
+
+ # (0) Install SSH client tools (and git, if you're pulling via SSH)
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends openssh-client git && \
+     rm -rf /var/lib/apt/lists/*
+
+ # The two following lines are requirements for the Dev Mode to be functional
+ # Learn more about the Dev Mode at https://huggingface.co/dev-mode-explorers
+ RUN useradd -m -u 1000 user
+ WORKDIR /app
+
+
+ # (2) Copy dependencies manifest
+ COPY --chown=user requirements.txt requirements.txt
+
+ # (3) Install dependencies, mounting SSH keys and optional HTTPS creds
+ RUN --mount=type=secret,id=AGENTEVAL_DEPLOY_KEY,mode=0400,required=true \
+     --mount=type=secret,id=ASTABENCH_DEPLOY_KEY,mode=0400,required=true \
+     mkdir -p /root/.ssh && chmod 700 /root/.ssh && \
+     cat /run/secrets/AGENTEVAL_DEPLOY_KEY > /root/.ssh/id_ed25519 && chmod 600 /root/.ssh/id_ed25519 && \
+     cat /run/secrets/ASTABENCH_DEPLOY_KEY > /root/.ssh/id_astabench && chmod 600 /root/.ssh/id_astabench && \
+     ssh-keyscan github.com >> /root/.ssh/known_hosts && \
+     printf 'Host github.com\n User git\n IdentityFile /root/.ssh/id_ed25519\n IdentityFile /root/.ssh/id_astabench\n StrictHostKeyChecking no\n' >> /root/.ssh/config && \
+     # rewrite all GitHub HTTPS URLs to SSH so nested deps install via SSH
+     git config --global url."ssh://git@github.com/".insteadOf "https://github.com/" && \
+     pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # (4) Copy in your Gradio app code
+ COPY . .
+ RUN mkdir -p /home/user/data && chown -R user:user /home/user/data
+
+ # Make the app treat this as non-debug (so DATA_DIR=/home/user/data)
+ ENV system=spaces
+
+ # (5) Switch to a non-root user
+ USER user
+
+ # (6) Expose Gradio's default port
+ EXPOSE 7860
+
+ # (7) Launch your app
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -0,0 +1,15 @@
+ ---
+ title: AstaBench Leaderboard
+ emoji: 🥇
+ colorFrom: green
+ colorTo: indigo
+ sdk: docker
+ app_file: app.py
+ pinned: true
+ license: apache-2.0
+ hf_oauth: true
+ app_port: 7860
+ failure_strategy: none
+ tags:
+   - leaderboard
+ ---
app.py ADDED
@@ -0,0 +1,644 @@
+ """app.py: Gradio app for the AstaBench leaderboard.
+
+ Modeled after the GAIA huggingface leaderboard app.
+
+ """
+
+ import json
+ import os
+ import shutil
+ import tarfile
+ import tempfile
+ from datetime import datetime, timedelta, timezone
+ from email.utils import parseaddr
+ from pathlib import Path
+ from zoneinfo import ZoneInfo
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import requests
+ from agenteval import (
+     compute_summary_statistics,
+     process_eval_logs,
+     upload_folder_to_hf,
+     upload_summary_to_hf,
+ )
+ from agenteval.models import EvalResult
+ from agenteval.upload import sanitize_path_component
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
+ from datasets.data_files import EmptyDatasetError
+ from huggingface_hub import HfApi
+
+ from content import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     INTRODUCTION_TEXT,
+     SUBMISSION_TEXT,
+     TITLE,
+     format_error,
+     format_log,
+     format_warning,
+     hf_uri_to_web_url,
+     hyperlink,
+ )
+
+ # Should be False on spaces and True outside
+ LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
+
+
+ CONFIG_NAME = "1.0.0-dev1"
+
+ IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
+
+ OWNER = "allenai"
+ PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
+ SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions" # all raw and scored submissions (val and test)
+ SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public" # copy scored val submissions (public for transparency - not used for rendering leaderboard)
+ CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
+ RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # just the summary score statistics (val and test), to be displayed on the leaderboard
+ LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
+
+ if LOCAL_DEBUG:
+     DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
+ else:
+     DATA_DIR = "/home/user/data/" + CONFIG_NAME
+ EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
+
+ api = HfApi()
+
+ # max upload size of 100MB
+ MAX_UPLOAD_BYTES = 100 * 1024**2
+
+ AGENTEVAL_MANIFEST_NAME = "agenteval.json"
+
+
+ os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
+
+
+ def try_load_dataset(*args, **kwargs) -> DatasetDict:
+     try:
+         return load_dataset(*args, **kwargs)
+     except EmptyDatasetError:
+         return DatasetDict()
+     except ValueError:
+         return DatasetDict()
+
+
+ def pretty_column_name(col: str) -> str:
+     """Map any raw column name to its display name."""
+     # text columns
+     if col == "submit_time":
+         return "Submission date"
+     elif col == "agent_name":
+         return "Agent"
+     elif col == "agent_description":
+         return "Agent description"
+     elif col == "username":
+         return "User/organization"
+     elif col == "logs_url":
+         return "Logs"
+     # cost → $
+     if col.endswith("/cost"):
+         return "$"
+     # stderr → CI
+     elif col.endswith("/cost_stderr") or col.endswith("/score_stderr"):
+         return "CI"
+     # overall score
+     elif col == "overall/score":
+         return "Overall"
+     # any other score → its tag/task name
+     elif col.endswith("/score"):
+         return col.split("/")[1]
+     # fallback to unchanged
+     return col
+
+
+ def get_dataframe_from_results(eval_results: DatasetDict, split: str):
+     local_df = eval_results.get(split)
+     # return default if split is missing or contains no records
+     if local_df is None or len(local_df) == 0:
+         default_raw_cols = [
+             "agent_name",
+             "agent_description",
+             "username",
+             "submit_time",
+         ]
+         pretty_cols = [pretty_column_name(c) for c in default_raw_cols]
+         return pd.DataFrame({col: ["No data"] for col in pretty_cols})
+
+     # Use the first suite_config for all rows
+     # because the suite_config should not change given a single CONFIG_NAME
+     first_suite_config = None
+     if len(local_df) > 0:
+         first_suite_config = EvalResult.model_validate(local_df[0]).suite_config
+
+     def extract_scores(eval_res: EvalResult) -> dict[str, float | None]:
+         summary_stats = compute_summary_statistics(
+             suite_config=first_suite_config,
+             split=split,
+             results=eval_res.results,
+         )
+
+         values: dict[str, float | None] = {}
+         for key in summary_stats:
+             if key == "overall":
+                 values["overall/score"] = summary_stats[key].score
+                 values["overall/cost"] = summary_stats[key].cost
+             elif key.startswith("tag/"):
+                 tag = key.split("/")[1]
+                 values[f"tag/{tag}/score"] = summary_stats[key].score
+                 values[f"tag/{tag}/cost"] = summary_stats[key].cost
+             elif key.startswith("task/"):
+                 task = key.split("/")[1]
+                 values[f"task/{task}/score"] = summary_stats[key].score
+                 values[f"task/{task}/score_stderr"] = summary_stats[key].score_stderr
+                 values[f"task/{task}/cost"] = summary_stats[key].cost
+                 values[f"task/{task}/cost_stderr"] = summary_stats[key].cost_stderr
+         return values
+
+     def format_row(row) -> dict[str, float | str | None]:
+         eval_res = EvalResult.model_validate(row)
+         sub = eval_res.submission
+         sub.submit_time = sub.submit_time or datetime(1970, 1, 1, 0, 0, 0)
+         data = {
+             "submit_time": sub.submit_time.astimezone(ZoneInfo("US/Pacific")).strftime(
+                 "%Y-%m-%d"
+             ),
+             "agent_name": (
+                 hyperlink(sub.agent_url, sub.agent_name)
+                 if sub.agent_url
+                 else sub.agent_name
+             ),
+             "agent_description": sub.agent_description or "",
+             "username": sub.username or "",
+             **extract_scores(eval_res),
+             "logs_url": (
+                 hyperlink(
+                     hf_uri_to_web_url(
+                         sub.logs_url if IS_INTERNAL else sub.logs_url_public
+                     ),
+                     "🔗",
+                 )
+                 if (sub.logs_url or sub.logs_url_public)
+                 else ""
+             ),
+         }
+         return data
+
+     local_df = local_df.map(format_row)
+
+     df = pd.DataFrame(local_df)
+
+     # Multiply score, cost, and stderr values by 100 and round to 1 decimal
+     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+     df[numeric_cols] = df[numeric_cols].multiply(100).round(1)
+
+     # Build column order on raw names, then rename via pretty_column_name
+     all_cols = df.columns.tolist()
+     base = ["agent_name", "agent_description", "username"]
+     overall = ["overall/score", "overall/cost"]
+     tags = sorted({c.split("/")[1] for c in all_cols if c.startswith("tag/")})
+     tasks = sorted({c.split("/")[1] for c in all_cols if c.startswith("task/")})
+     rest = ["submit_time", "logs_url"]
+     column_order = (
+         base
+         + overall
+         + [col for tag in tags for col in (f"tag/{tag}/score", f"tag/{tag}/cost")]
+         + [
+             col
+             for t in tasks
+             for col in (
+                 f"task/{t}/score",
+                 f"task/{t}/score_stderr",
+                 f"task/{t}/cost",
+                 f"task/{t}/cost_stderr",
+             )
+         ]
+         + rest
+     )
+     df = df.reindex(columns=[c for c in column_order if c in all_cols])
+     # sort by overall score (descending)
+     df = df.sort_values(by=["overall/score"], ascending=False)
+     # apply all renames via pretty_column_name
+     orig_cols = df.columns.tolist()
+     df.columns = [pretty_column_name(col) for col in orig_cols]
+
+     # blank out any null/NaN cells
+     df = df.fillna("")
+
+     return df
+
+
+ def load_and_format_dataframes():
+     eval_results = try_load_dataset(
+         RESULTS_DATASET,
+         CONFIG_NAME,
+         download_mode="force_redownload",
+         verification_mode=VerificationMode.NO_CHECKS,
+         trust_remote_code=True,
+     )
+     eval_dataframe_val = get_dataframe_from_results(
+         eval_results=eval_results, split="validation"
+     )
+     eval_dataframe_test = get_dataframe_from_results(
+         eval_results=eval_results, split="test"
+     )
+     return eval_results, eval_dataframe_val, eval_dataframe_test
+
+
+ # Display the results
+ eval_results, eval_dataframe_val, eval_dataframe_test = load_and_format_dataframes()
+
+
+ def restart_space():
+     api.restart_space(repo_id=LEADERBOARD_PATH)
+
+
+ def checked_upload_folder(
+     api,
+     folder_path: str,
+     repo_id: str,
+     config_name: str,
+     split: str,
+     submission_name: str,
+ ) -> str:
+     """Upload with inline size check; raises ValueError if too large."""
+     total = 0
+     for root, _, files in os.walk(folder_path):
+         for f in files:
+             total += os.path.getsize(os.path.join(root, f))
+     if total > MAX_UPLOAD_BYTES:
+         raise ValueError(
+             f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
+         )
+     # NOTE: This function raises ValueError if unsafe characters are found in the path.
+     return upload_folder_to_hf(
+         api=api,
+         folder_path=folder_path,
+         repo_id=repo_id,
+         config_name=config_name,
+         split=split,
+         submission_name=submission_name,
+     )
+
+
+ def add_new_eval(
+     val_or_test: str,
+     agent_name: str | None,
+     agent_description: str,
+     agent_url: str,
+     path_to_file: tempfile._TemporaryFileWrapper | None,
+     username: str,
+     mail: str,
+     profile: gr.OAuthProfile,
+ ):
+     # default username if none provided
+     if not username or username.strip() == "":
+         username = profile.username
+
+     if not agent_name:
+         return format_warning("Please provide an agent name.")
+
+     submission_time = datetime.now(timezone.utc)
+
+     # Was the profile created less than 2 months ago?
+     user_data = requests.get(
+         f"https://huggingface.co/api/users/{profile.username}/overview"
+     )
+     creation_date = json.loads(user_data.content)["createdAt"]
+
+     created_at = datetime.strptime(creation_date, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
+         tzinfo=timezone.utc
+     )
+     if submission_time - created_at < timedelta(days=60):
+         return format_error("This account is not authorized to submit here.")
+
+     contact_infos = try_load_dataset(
+         CONTACT_DATASET,
+         CONFIG_NAME,
+         download_mode="force_redownload",
+         verification_mode=VerificationMode.NO_CHECKS,
+         trust_remote_code=True,
+     )
+     user_submission_dates = sorted(
+         datetime.fromisoformat(row["submit_time"])
+         for row in contact_infos.get(val_or_test, [])
+         if row["username_auth"] == profile.username
+     )
+     if len(user_submission_dates) > 0 and abs(
+         submission_time - user_submission_dates[-1]
+     ) < timedelta(seconds=24 * 60 * 60):
+         return format_error(
+             "You already submitted once in the last 24h; please try again later."
+         )
+
+     is_validation = val_or_test == "validation"
+
+     # Very basic email parsing
+     _, parsed_mail = parseaddr(mail)
+     if "@" not in parsed_mail:
+         return format_warning("Please provide a valid email address.")
+
+     # Check duplicate submissions by inspecting the nested "submission" dicts
+     if val_or_test in eval_results and len(eval_results[val_or_test]) > 0:
+         existing = eval_results[val_or_test]
+         subs = existing.to_dict().get("submission", [])
+         names = {item.get("agent_name", "").lower() for item in subs}
+         users = {item.get("username", "").lower() for item in subs}
+         if agent_name.lower() in names and username.lower() in users:
+             return format_warning("This agent has already been submitted.")
+
+     if path_to_file is None:
+         return format_warning("Please attach a file.")
+
+     # sanitize username and agent_name for filesystem
+     safe_username = sanitize_path_component(username)
+     safe_agent_name = sanitize_path_component(agent_name)
+
+     extracted_dir = os.path.join(
+         EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}"
+     )
+
+     if LOCAL_DEBUG:
+         print("mock extracted file", flush=True)
+     else:
+         try:
+             # 1) remove old extraction if present
+             if os.path.exists(extracted_dir):
+                 shutil.rmtree(extracted_dir)
+             os.makedirs(extracted_dir, exist_ok=True)
+
+             # 2) securely extract only regular files, flatten structure
+             # Flatten structure to aid finding the manifest agenteval.json file
+             # and because hierarchical structure is not needed
+             with tarfile.open(path_to_file.name, "r:gz") as tar:
+                 for member in tar.getmembers():
+                     if not member.isreg():
+                         continue
+                     fname = os.path.basename(member.name)
+                     # skip empty or hidden
+                     if not fname or fname.startswith("."):
+                         continue
+                     fobj = tar.extractfile(member)
+                     if not fobj:
+                         continue
+                     target = os.path.join(extracted_dir, fname)
+                     with open(target, "wb") as out:
+                         out.write(fobj.read())
+
+             # 3) ensure something was extracted
+             if not os.listdir(extracted_dir):
+                 return format_error("Submission tarball is empty or invalid.")
+
+         except Exception as e:
+             return format_error(
+                 f"Error while extracting the file: {e}. Be sure to upload a valid .tar.gz file."
+             )
+
+     submission_name = (
+         f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d')}"
+     )
+
+     # SAVE UNSCORED SUBMISSION
+     if LOCAL_DEBUG:
+         print("mock uploaded submission", flush=True)
+     else:
+         try:
+             checked_upload_folder(
+                 api=api,
+                 folder_path=extracted_dir,
+                 repo_id=SUBMISSION_DATASET,
+                 config_name=CONFIG_NAME,
+                 split=val_or_test,
+                 submission_name=submission_name,
+             )
+         except ValueError as e:
+             return format_error(str(e))
+
+     # SAVE CONTACT
+     contact_info = {
+         "agent_name": agent_name,
+         "agent_description": agent_description,
+         "url": agent_url,
+         "username": username,
+         "username_auth": profile.username,
+         "mail": mail,
+         "submit_time": submission_time.isoformat(),
+     }
+     # add or init contact dataset for this split
+     if val_or_test in contact_infos:
+         contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
+     else:
+         contact_infos[val_or_test] = Dataset.from_list([contact_info])
+     if LOCAL_DEBUG:
+         print("mock uploaded contact info", flush=True)
+     else:
+         contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
+
+     try:
+         json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
+         if not json_path.exists():
+             return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME}")
+         raw = json_path.read_text(encoding="utf-8")
+         eval_result = EvalResult.model_validate_json(raw)
+         if eval_result.suite_config.version != CONFIG_NAME:
+             return format_error(
+                 f"Error: submitted suite version {eval_result.suite_config.version} "
+                 f"does not match currently accepted version {CONFIG_NAME}"
+             )
+         if eval_result.split != val_or_test:
+             return format_error(
+                 f"Error: uploaded split {eval_result.split} does not match selected split {val_or_test}"
+             )
+
+         # NOTE: Trusting user-computed scores, but re-computing the derived results based on the log files
+         eval_result.results = process_eval_logs(extracted_dir)[0]
+         eval_result.save_json(str(json_path))
+
+     except Exception as e:
+         return format_error(
+             f"Error while scoring the submission: {e}. Be sure to upload a valid submission."
+         )
+
+     # SAVE SCORED SUBMISSION
+     if LOCAL_DEBUG:
+         print("mock uploaded scored submission")
+     else:
+         try:
+             logs_url_private = checked_upload_folder(
+                 api=api,
+                 folder_path=extracted_dir,
+                 repo_id=SUBMISSION_DATASET,
+                 config_name=CONFIG_NAME,
+                 split=val_or_test,
+                 submission_name=f"{submission_name}_scored",
+             )
+         except ValueError as e:
+             return format_error(str(e))
+
+         # Validation submissions are public for public leaderboard
+         if is_validation and not IS_INTERNAL:
+             try:
+                 logs_url_public = checked_upload_folder(
+                     api=api,
+                     folder_path=extracted_dir,
+                     repo_id=SUBMISSION_DATASET_PUBLIC,
+                     config_name=CONFIG_NAME,
+                     split=val_or_test,
+                     submission_name=f"{submission_name}_scored",
+                 )
+             except ValueError as e:
+                 return format_error(str(e))
+         else:
+             logs_url_public = None
+
+     eval_result.submission.agent_name = agent_name
+     eval_result.submission.agent_description = agent_description
+     eval_result.submission.agent_url = agent_url
+     eval_result.submission.username = username
+     eval_result.submission.submit_time = submission_time
+     eval_result.submission.logs_url = logs_url_private
+     eval_result.submission.logs_url_public = logs_url_public
+
+     if LOCAL_DEBUG:
+         print("mock uploaded results to lb")
+     else:
+         upload_summary_to_hf(
+             api=api,
+             eval_result=eval_result,
+             repo_id=RESULTS_DATASET,
+             config_name=CONFIG_NAME,
+             split=val_or_test,
+             submission_name=f"{submission_name}_scored",
+         )
+
+     return format_log(
+         f"Agent {agent_name} submitted by {username} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed."
+     )
+
+
+ def refresh():
+     _, eval_dataframe_val, eval_dataframe_test = load_and_format_dataframes()
+     return eval_dataframe_val, eval_dataframe_test
+
+
+ # Determine column types dynamically based on dataframe columns
+ def compute_column_types(df):
+     col_types = []
+     for col in df.columns:
+         if col == "Agent":
+             col_types.append("markdown")
+         elif col in ["Agent description", "User/organization", "Submission date"]:
+             col_types.append("str")
+         elif col == "Logs":
+             col_types.append("markdown")
+         else:
+             col_types.append("number")
+     return col_types
+
+
+ test_col_types = compute_column_types(eval_dataframe_test)
+ val_col_types = compute_column_types(eval_dataframe_val)
+
+ demo = gr.Blocks()
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id="citation-button",
+             ) # .style(show_copy_button=True)
+
+     leaderboard_table_test = gr.Dataframe(
+         value=eval_dataframe_test,
+         headers=list(eval_dataframe_test.columns),
+         datatype=test_col_types,
+         interactive=False,
+         column_widths=["20%"],
+         render=False,
+     )
+
+     leaderboard_table_val = gr.Dataframe(
+         value=eval_dataframe_val,
+         headers=list(eval_dataframe_val.columns),
+         datatype=val_col_types,
+         interactive=False,
+         column_widths=["20%"],
+         render=False,
+     )
+
+     # Build tab layout list based on desired order
+     tabs = [
+         ("Results: Test", leaderboard_table_test),
+         ("Results: Validation", leaderboard_table_val),
+     ]
+
+     if IS_INTERNAL:
+         tabs = [tabs[1], tabs[0]] # Validation first for internal users
+
+     # Render the tabs in desired order
+     for label, component in tabs:
+         with gr.Tab(label):
+             component.render()
+
+     refresh_button = gr.Button("Refresh")
+     refresh_button.click(
+         refresh,
+         inputs=[],
+         outputs=[
+             leaderboard_table_val,
+             leaderboard_table_test,
+         ],
+     )
+     with gr.Accordion("Submit a new agent for evaluation"):
+         with gr.Row():
+             gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
+         with gr.Row():
+             with gr.Column():
+                 level_of_test = gr.Radio(
+                     ["validation", "test"], value="validation", label="Split"
+                 )
+                 agent_name_textbox = gr.Textbox(label="Agent name")
+                 agent_description_textbox = gr.Textbox(label="Agent description")
+                 agent_url_textbox = gr.Textbox(label="URL to agent information")
+             with gr.Column():
+                 username = gr.Textbox(
+                     label="Organization or user name (defaults to your HF username)",
+                     placeholder="Leave blank to use your HF username",
+                 )
+                 mail = gr.Textbox(
+                     label="Contact email (will be stored privately, & used if there is an issue with your submission)"
+                 )
+                 file_output = gr.File()
+
+         with gr.Row():
+             gr.LoginButton()
+             submit_button = gr.Button("Submit Eval")
+         submission_result = gr.Markdown()
+         submit_button.click(
+             add_new_eval,
+             [
+                 level_of_test,
+                 agent_name_textbox,
+                 agent_description_textbox,
+                 agent_url_textbox,
+                 file_output,
+                 username,
+                 mail,
+             ],
+             submission_result,
+         )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=3600)
+ scheduler.start()
+ if LOCAL_DEBUG:
+     demo.launch(debug=True)
+ else:
+     demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
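
Editor's note: for development outside the Docker image (not covered by this commit), one way to run the app is to leave the `system` environment variable unset, so LOCAL_DEBUG stays true, data is written under ./data/1.0.0-dev1, and uploads are mocked. A rough sketch, assuming access to the private agent-eval repository and the result datasets:

  # install dependencies (agent-eval is pulled from GitHub per requirements.txt)
  pip install -r requirements.txt
  # run without system=spaces so LOCAL_DEBUG stays true and uploads are mocked
  python app.py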
content.py ADDED
@@ -0,0 +1,55 @@
+ TITLE = """<h1 align="center" id="space-title">AstaBench Leaderboard</h1>"""
+
+ INTRODUCTION_TEXT = """
+ ## Introduction
+ """
+
+ SUBMISSION_TEXT = """
+ ## Submissions
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@article{asta-bench,
+ title={AstaBench},
+ author={AstaBench folks},
+ year={2025},
+ eprint={TBD.TBD},
+ archivePrefix={arXiv},
+ primaryClass={cs.AI},
+ secondaryClass={cs.CL}
+ }"""
+
+
+ def format_error(msg):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def format_warning(msg):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def format_log(msg):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def hyperlink(link, text):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{text}</a>'
+
+
+ def hf_uri_to_web_url(uri: str) -> str:
+     """
+     Convert a Hugging Face-style URI like:
+         hf://datasets/{namespace}/{repo}/{path...}
+     into a public web URL:
+         https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path...}
+     """
+     prefix = "hf://datasets/"
+     if not uri.startswith(prefix):
+         raise ValueError("URI must start with 'hf://datasets/'")
+
+     parts = uri[len(prefix) :].split("/", 2)
+     if len(parts) < 3:
+         raise ValueError("Expected format: hf://datasets/{namespace}/{repo}/{path...}")
+
+     namespace, repo, path = parts
+     return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}"
requirements-dev.txt ADDED
@@ -0,0 +1,2 @@
+ black
+ isort
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ datasets
+ gradio[oauth]
+ huggingface-hub
+ APScheduler
+ agent-eval @ git+https://github.com/allenai/agent-eval.git@d302cb5d0ba983ae5f0764c53fde4e017118d0df#egg=agent-eval