Spaces:
Running
Running
Jason
committed on
Jason/inttest and contact record improvements for reviewer (#97)
Browse files- .gitattributes +1 -0
- .github/workflows/integration-tests.yml +44 -0
- app.py +0 -1
- requirements-dev.txt +2 -0
- submission.py +25 -22
- tests/integration/test-submission.tar.gz +3 -0
- tests/integration/test_submission.py +110 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tests/integration/test-submission.tar.gz filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/integration-tests.yml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Integration Tests
|
2 |
+
|
3 |
+
on:
|
4 |
+
pull_request:
|
5 |
+
branches: [ main ]
|
6 |
+
|
7 |
+
jobs:
|
8 |
+
integration-test:
|
9 |
+
runs-on: ubuntu-latest
|
10 |
+
|
11 |
+
environment:
|
12 |
+
name: testing
|
13 |
+
|
14 |
+
steps:
|
15 |
+
- uses: actions/checkout@v4
|
16 |
+
with:
|
17 |
+
lfs: true
|
18 |
+
|
19 |
+
- name: Set up Python 3.11
|
20 |
+
uses: actions/setup-python@v4
|
21 |
+
with:
|
22 |
+
python-version: '3.11'
|
23 |
+
|
24 |
+
- name: Cache pip dependencies
|
25 |
+
uses: actions/cache@v3
|
26 |
+
with:
|
27 |
+
path: ~/.cache/pip
|
28 |
+
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
|
29 |
+
restore-keys: |
|
30 |
+
${{ runner.os }}-pip-
|
31 |
+
|
32 |
+
- name: Install dependencies
|
33 |
+
run: |
|
34 |
+
python -m pip install --upgrade pip
|
35 |
+
pip install -r requirements.txt
|
36 |
+
pip install -r requirements-dev.txt
|
37 |
+
|
38 |
+
- name: Run integration tests
|
39 |
+
run: |
|
40 |
+
pytest tests/integration/ -v --tb=short
|
41 |
+
env:
|
42 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
43 |
+
HF_CONFIG: continuous-integration
|
44 |
+
IS_INTERNAL: true
|
app.py
CHANGED
@@ -237,7 +237,6 @@ def restart_space_job():
|
|
237 |
if __name__ == "__main__":
|
238 |
if LOCAL_DEBUG:
|
239 |
print("Launching in LOCAL_DEBUG mode.")
|
240 |
-
def get_initial_global_tag_choices(): return ["Overall"]
|
241 |
demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
|
242 |
else:
|
243 |
print("Launching in Space mode.")
|
|
|
237 |
if __name__ == "__main__":
|
238 |
if LOCAL_DEBUG:
|
239 |
print("Launching in LOCAL_DEBUG mode.")
|
|
|
240 |
demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
|
241 |
else:
|
242 |
print("Launching in Space mode.")
|
requirements-dev.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
black
|
2 |
isort
|
|
|
|
|
|
1 |
black
|
2 |
isort
|
3 |
+
pytest~=8.4.1
|
4 |
+
pytest-mock~=3.14.1
|
submission.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import logging
|
2 |
-
import
|
3 |
|
4 |
import matplotlib
|
5 |
from agenteval.cli import SUBMISSION_METADATA_FILENAME
|
@@ -12,18 +12,11 @@ matplotlib.use('Agg')
|
|
12 |
import os
|
13 |
import shutil
|
14 |
import tarfile
|
15 |
-
import tempfile
|
16 |
from datetime import datetime, timedelta, timezone
|
17 |
from email.utils import parseaddr
|
18 |
-
from pathlib import Path
|
19 |
|
20 |
import gradio as gr
|
21 |
import requests
|
22 |
-
from agenteval import (
|
23 |
-
process_eval_logs,
|
24 |
-
upload_folder_to_hf,
|
25 |
-
)
|
26 |
-
from agenteval.leaderboard.models import LeaderboardSubmission
|
27 |
from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component
|
28 |
from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
|
29 |
from datasets.data_files import EmptyDatasetError
|
@@ -34,8 +27,6 @@ from config import (
|
|
34 |
CONFIG_NAME,
|
35 |
CONTACT_DATASET,
|
36 |
EXTRACTED_DATA_DIR,
|
37 |
-
IS_INTERNAL,
|
38 |
-
LOCAL_DEBUG,
|
39 |
RESULTS_DATASET,
|
40 |
SUBMISSION_DATASET,
|
41 |
)
|
@@ -90,6 +81,7 @@ def upload_submission(
|
|
90 |
_validate_path_component(split, "split")
|
91 |
_validate_path_component(submission_name, "submission_name")
|
92 |
dataset_url = f"hf://datasets/{SUBMISSION_DATASET}/{CONFIG_NAME}/{split}/{submission_name}"
|
|
|
93 |
api.upload_folder(
|
94 |
folder_path=folder_path,
|
95 |
path_in_repo=f"{CONFIG_NAME}/{split}/{submission_name}",
|
@@ -111,7 +103,7 @@ def add_new_eval(
|
|
111 |
agent_url: str,
|
112 |
openness: str | None,
|
113 |
degree_of_control: str | None,
|
114 |
-
path_to_file:
|
115 |
username: str,
|
116 |
role: str,
|
117 |
email: str,
|
@@ -173,13 +165,13 @@ def add_new_eval(
|
|
173 |
logger.debug(f"agent {agent_name}: Submission frequency check {profile.username}")
|
174 |
contact_infos = try_load_dataset_submission(
|
175 |
CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
|
176 |
-
verification_mode=VerificationMode.NO_CHECKS
|
177 |
-
)
|
178 |
-
user_submission_dates = sorted(
|
179 |
-
datetime.fromisoformat(row["submit_time"])
|
180 |
-
for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
|
181 |
)
|
182 |
-
if
|
|
|
|
|
|
|
|
|
183 |
logger.info(f"agent {agent_name}: Denied submission because user {username} submitted recently")
|
184 |
return (
|
185 |
format_error("You already submitted once in the last 24h for this split; please try again later."), # error_message
|
@@ -262,7 +254,7 @@ def add_new_eval(
|
|
262 |
|
263 |
logger.info(f"agent {agent_name}: Upload raw (unscored) submission files")
|
264 |
try:
|
265 |
-
upload_submission(extracted_dir, val_or_test, submission_name, profile.username)
|
266 |
except ValueError as e:
|
267 |
return (
|
268 |
format_error(str(e)), # error_message
|
@@ -280,11 +272,11 @@ def add_new_eval(
|
|
280 |
|
281 |
logger.info(f"agent {agent_name}: Save contact information")
|
282 |
contact_info = subm_meta.model_dump()
|
283 |
-
contact_info["submit_time"] = submission_time.isoformat()
|
284 |
contact_info["username_auth"] = profile.username
|
285 |
contact_info["email"] = email
|
286 |
contact_info["email_opt_in"] = email_opt_in
|
287 |
contact_info["role"] = role
|
|
|
288 |
|
289 |
logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
|
290 |
if val_or_test in contact_infos:
|
@@ -293,7 +285,11 @@ def add_new_eval(
|
|
293 |
contact_infos[val_or_test] = Dataset.from_list([contact_info])
|
294 |
|
295 |
try:
|
296 |
-
contact_infos.push_to_hub(
|
|
|
|
|
|
|
|
|
297 |
except Exception as e:
|
298 |
return (
|
299 |
format_error(f"Submission recorded, but contact info failed to save: {e}"), # error_message
|
@@ -304,7 +300,7 @@ def add_new_eval(
|
|
304 |
|
305 |
logger.info(f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split.")
|
306 |
return (
|
307 |
-
"", #
|
308 |
gr.update(visible=False), # error_modal
|
309 |
gr.update(visible=True), # success_modal
|
310 |
gr.update(visible=False) # loading_modal
|
@@ -319,6 +315,14 @@ def _is_hf_acct_too_new(submission_time: datetime, username: str):
|
|
319 |
return submission_time - created_at < timedelta(days=60)
|
320 |
|
321 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
openness_label_html = f"""<div>
|
323 |
<b>Agent Openness</b>
|
324 |
{build_openness_tooltip_content()}
|
@@ -331,7 +335,6 @@ agent_tooling_label_html = f"""<div>
|
|
331 |
</div>"""
|
332 |
|
333 |
|
334 |
-
|
335 |
heading_html = """
|
336 |
<h2>🚀 Submit an agent for evaluation</h2>
|
337 |
<p>Submit your agent to AstaBench for evaluation on real-world scientific tasks. Once submitted, your run will be reviewed by our team. If there are any issues, we’ll reach out within 5–7 business days. We’re working toward full automation, but in the meantime, human review helps ensure quality and trust.</p>
|
|
|
1 |
import logging
|
2 |
+
import typing
|
3 |
|
4 |
import matplotlib
|
5 |
from agenteval.cli import SUBMISSION_METADATA_FILENAME
|
|
|
12 |
import os
|
13 |
import shutil
|
14 |
import tarfile
|
|
|
15 |
from datetime import datetime, timedelta, timezone
|
16 |
from email.utils import parseaddr
|
|
|
17 |
|
18 |
import gradio as gr
|
19 |
import requests
|
|
|
|
|
|
|
|
|
|
|
20 |
from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component
|
21 |
from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
|
22 |
from datasets.data_files import EmptyDatasetError
|
|
|
27 |
CONFIG_NAME,
|
28 |
CONTACT_DATASET,
|
29 |
EXTRACTED_DATA_DIR,
|
|
|
|
|
30 |
RESULTS_DATASET,
|
31 |
SUBMISSION_DATASET,
|
32 |
)
|
|
|
81 |
_validate_path_component(split, "split")
|
82 |
_validate_path_component(submission_name, "submission_name")
|
83 |
dataset_url = f"hf://datasets/{SUBMISSION_DATASET}/{CONFIG_NAME}/{split}/{submission_name}"
|
84 |
+
logger.info(f"Uploading dataset {dataset_url}")
|
85 |
api.upload_folder(
|
86 |
folder_path=folder_path,
|
87 |
path_in_repo=f"{CONFIG_NAME}/{split}/{submission_name}",
|
|
|
103 |
agent_url: str,
|
104 |
openness: str | None,
|
105 |
degree_of_control: str | None,
|
106 |
+
path_to_file: typing.IO | None,
|
107 |
username: str,
|
108 |
role: str,
|
109 |
email: str,
|
|
|
165 |
logger.debug(f"agent {agent_name}: Submission frequency check {profile.username}")
|
166 |
contact_infos = try_load_dataset_submission(
|
167 |
CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
|
168 |
+
verification_mode=VerificationMode.NO_CHECKS
|
|
|
|
|
|
|
|
|
169 |
)
|
170 |
+
if _is_last_submission_too_recent(
|
171 |
+
contact_rows=contact_infos.get(val_or_test, []),
|
172 |
+
username=profile.username,
|
173 |
+
submission_time=submission_time,
|
174 |
+
):
|
175 |
logger.info(f"agent {agent_name}: Denied submission because user {username} submitted recently")
|
176 |
return (
|
177 |
format_error("You already submitted once in the last 24h for this split; please try again later."), # error_message
|
|
|
254 |
|
255 |
logger.info(f"agent {agent_name}: Upload raw (unscored) submission files")
|
256 |
try:
|
257 |
+
dataset_url = upload_submission(extracted_dir, val_or_test, submission_name, profile.username)
|
258 |
except ValueError as e:
|
259 |
return (
|
260 |
format_error(str(e)), # error_message
|
|
|
272 |
|
273 |
logger.info(f"agent {agent_name}: Save contact information")
|
274 |
contact_info = subm_meta.model_dump()
|
|
|
275 |
contact_info["username_auth"] = profile.username
|
276 |
contact_info["email"] = email
|
277 |
contact_info["email_opt_in"] = email_opt_in
|
278 |
contact_info["role"] = role
|
279 |
+
contact_info["dataset_url"] = dataset_url
|
280 |
|
281 |
logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
|
282 |
if val_or_test in contact_infos:
|
|
|
285 |
contact_infos[val_or_test] = Dataset.from_list([contact_info])
|
286 |
|
287 |
try:
|
288 |
+
contact_infos.push_to_hub(
|
289 |
+
repo_id=CONTACT_DATASET,
|
290 |
+
config_name=CONFIG_NAME,
|
291 |
+
commit_message=f'Submission from hf user "{profile.username}" to "{dataset_url}"',
|
292 |
+
)
|
293 |
except Exception as e:
|
294 |
return (
|
295 |
format_error(f"Submission recorded, but contact info failed to save: {e}"), # error_message
|
|
|
300 |
|
301 |
logger.info(f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split.")
|
302 |
return (
|
303 |
+
"", # message
|
304 |
gr.update(visible=False), # error_modal
|
305 |
gr.update(visible=True), # success_modal
|
306 |
gr.update(visible=False) # loading_modal
|
|
|
315 |
return submission_time - created_at < timedelta(days=60)
|
316 |
|
317 |
|
318 |
+
def _is_last_submission_too_recent(contact_rows, username, submission_time):
|
319 |
+
user_submission_dates = sorted(
|
320 |
+
datetime.fromisoformat(row["submit_time"])
|
321 |
+
for row in contact_rows if row["username_auth"] == username
|
322 |
+
)
|
323 |
+
return user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1))
|
324 |
+
|
325 |
+
|
326 |
openness_label_html = f"""<div>
|
327 |
<b>Agent Openness</b>
|
328 |
{build_openness_tooltip_content()}
|
|
|
335 |
</div>"""
|
336 |
|
337 |
|
|
|
338 |
heading_html = """
|
339 |
<h2>🚀 Submit an agent for evaluation</h2>
|
340 |
<p>Submit your agent to AstaBench for evaluation on real-world scientific tasks. Once submitted, your run will be reviewed by our team. If there are any issues, we’ll reach out within 5–7 business days. We’re working toward full automation, but in the meantime, human review helps ensure quality and trust.</p>
|
tests/integration/test-submission.tar.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:34e9cba8a8431af3323b2dc8ce639f0dc058b4493c08eb65dbf20fe6936a27e8
|
3 |
+
size 8187172
|
tests/integration/test_submission.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
from datetime import datetime
|
4 |
+
|
5 |
+
import gradio
|
6 |
+
import pytest
|
7 |
+
import pyarrow as pa
|
8 |
+
from agenteval.models import SubmissionMetadata
|
9 |
+
from datasets import load_dataset, VerificationMode
|
10 |
+
from huggingface_hub import HfApi, hf_hub_download
|
11 |
+
|
12 |
+
from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
|
13 |
+
from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
|
14 |
+
from submission import add_new_eval
|
15 |
+
|
16 |
+
_hf = HfApi()
|
17 |
+
|
18 |
+
|
19 |
+
class TestSubmission:
    """Integration test that drives ``add_new_eval`` and inspects what it wrote.

    The test submits the bundled ``test-submission.tar.gz`` and then checks the
    contact dataset, the submission dataset's commit history and its uploaded
    files for the resulting records.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        # These need to be set before imports are evaluated so all we can do here
        # is check that they have been set correctly.
        assert IS_INTERNAL is True
        assert CONFIG_NAME == "continuous-integration"

    def test_add_new_eval(self, mocker):
        """Submit the sample archive and verify the contact and submission records."""
        # Bypass some checks so that the test can cover later parts of the code.
        mocker.patch("submission._is_hf_acct_too_new", return_value=False)
        mocker.patch("submission._is_last_submission_too_recent", return_value=False)

        # We use this to find records corresponding to this test.
        agent_description = f"CI run at {datetime.now().isoformat()}"
        print(f"Using unique agent description: {agent_description}")

        print("Submitting test submission...")
        archive_path = os.path.join(os.path.dirname(__file__), "test-submission.tar.gz")
        with open(archive_path, "rb") as archive:
            result = add_new_eval(
                val_or_test="test",
                agent_name="TestSubmissionIntegration",
                agent_description=agent_description,
                agent_url="https://github.com/allenai/asta-bench-leaderboard/blob/main/tests/integration/test_submission.py",
                openness=CANONICAL_OPENNESS_CLOSED_UI_ONLY,
                degree_of_control=CANONICAL_TOOL_USAGE_STANDARD,
                path_to_file=archive,
                username="test_user",
                role="Other",
                # NOTE(review): this address looks redacted/obfuscated; confirm it
                # is what the submission code is expected to accept.
                email="[email protected]",
                email_opt_in=True,
                profile=gradio.OAuthProfile({
                    "name": "Test User",
                    "preferred_username": "test_user",
                    "profile": "test_user_profile",
                    "picture": "https://placecats.com/150/150",
                }),
            )

        message, error_modal, success_modal, loading_modal = result
        assert message == ""  # Success
        assert error_modal == {'__type__': 'update', 'visible': False}
        assert success_modal == {'__type__': 'update', 'visible': True}
        assert loading_modal == {'__type__': 'update', 'visible': False}

        print("Looking up contact record...")
        contacts = load_dataset(
            path=CONTACT_DATASET,
            name=CONFIG_NAME,
            download_mode="force_redownload",
            verification_mode=VerificationMode.NO_CHECKS,
        )
        # There should have been a new entry due to this test with our unique description.
        # A default is passed to next() so a missing row fails the assertion below
        # with a readable message instead of escaping as a bare StopIteration.
        found_contact = next(
            (row for row in contacts['test'] if row['agent_description'] == agent_description),
            None,
        )
        assert found_contact, f"No contact record with agent_description {agent_description!r}"

        # This contains an attribute that should lead us to files in the submissions dataset.
        dataset_url = found_contact['dataset_url']
        print(f"Found dataset URL: {dataset_url}")
        # Kept as a literal (not derived from SUBMISSION_DATASET) so the test also
        # pins which repository the CI configuration writes to.
        repo_url_prefix = "hf://datasets/allenai/asta-bench-internal-submissions/"
        assert dataset_url.startswith(repo_url_prefix + "continuous-integration/test/")

        print("Checking submission dataset...")
        # Commit message itself should link this and the contact record together unambiguously.
        recent_commits = _hf.list_repo_commits(repo_type="dataset", repo_id=SUBMISSION_DATASET)
        assert any(dataset_url in c.title for c in recent_commits)

        print("Checking that files are present...")
        rel_path = dataset_url[len(repo_url_prefix):]
        ds_info = _hf.dataset_info(SUBMISSION_DATASET)
        # Collect the repository's file names once instead of rescanning per assertion.
        repo_files = {sibling.rfilename for sibling in ds_info.siblings}
        # These are the files in our test-submission.tar.gz
        assert f"{rel_path}/eval_config.json" in repo_files
        assert f"{rel_path}/task_sqa_solver_openscilm.eval" in repo_files
        # This is the generated metadata put into the dataset itself.
        assert f"{rel_path}/submission.json" in repo_files

        print("Checking contact record against submission.json...")
        # Checks on contact record which is stored in a private dataset.
        local_path = hf_hub_download(
            repo_type="dataset",
            repo_id=SUBMISSION_DATASET,
            filename=f"{rel_path}/submission.json",
        )
        with open(local_path, encoding="utf-8") as metadata_file:
            contact_from_json = json.load(metadata_file)
        # Assert that all keys and values in submission.json are present in the contact record
        for key, value_from_json in contact_from_json.items():
            value_from_dataset = found_contact[key]
            if isinstance(value_from_dataset, datetime):
                # Render the dataset's datetime the way it appears in the JSON
                # ("Z" suffix rather than "+00:00") before comparing.
                value_from_dataset = value_from_dataset.isoformat().replace('+00:00', 'Z')
            assert value_from_dataset == value_from_json
        # submission.json should not contain sensitive PII, specifically, email.
        assert 'email' in found_contact
        assert 'email' not in contact_from_json
        # submission.json is defined by a specific data model.
        SubmissionMetadata.model_validate(contact_from_json)
|