Jason committed
Commit 50aa233 · unverified · 1 Parent(s): ba77e6d

Jason/inttest and contact record improvements for reviewer (#97)
.gitattributes ADDED
@@ -0,0 +1 @@
+tests/integration/test-submission.tar.gz filter=lfs diff=lfs merge=lfs -text
.github/workflows/integration-tests.yml ADDED
@@ -0,0 +1,44 @@
+name: Integration Tests
+
+on:
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  integration-test:
+    runs-on: ubuntu-latest
+
+    environment:
+      name: testing
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+
+      - name: Cache pip dependencies
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -r requirements-dev.txt
+
+      - name: Run integration tests
+        run: |
+          pytest tests/integration/ -v --tb=short
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_CONFIG: continuous-integration
+          IS_INTERNAL: true
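Reviewer note: a minimal sketch of reproducing this job locally, mirroring the env block above. It is not part of the commit and assumes your own HF_TOKEN has access to the internal datasets.

# Sketch of a local run equivalent to the CI job above; not part of this commit.
# Assumes a valid Hugging Face token (HF_TOKEN is read by huggingface_hub).
import os
import pytest

os.environ["HF_CONFIG"] = "continuous-integration"
os.environ["IS_INTERNAL"] = "true"
# Export HF_TOKEN in your shell rather than hard-coding it here.

# Equivalent to the workflow step: pytest tests/integration/ -v --tb=short
raise SystemExit(pytest.main(["tests/integration/", "-v", "--tb=short"]))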
app.py CHANGED
@@ -237,7 +237,6 @@ def restart_space_job():
 if __name__ == "__main__":
     if LOCAL_DEBUG:
         print("Launching in LOCAL_DEBUG mode.")
-        def get_initial_global_tag_choices(): return ["Overall"]
         demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
     else:
         print("Launching in Space mode.")
requirements-dev.txt CHANGED
@@ -1,2 +1,4 @@
 black
 isort
+pytest~=8.4.1
+pytest-mock~=3.14.1
submission.py CHANGED
@@ -1,5 +1,5 @@
 import logging
-import sys
+import typing
 
 import matplotlib
 from agenteval.cli import SUBMISSION_METADATA_FILENAME
@@ -12,18 +12,11 @@ matplotlib.use('Agg')
 import os
 import shutil
 import tarfile
-import tempfile
 from datetime import datetime, timedelta, timezone
 from email.utils import parseaddr
-from pathlib import Path
 
 import gradio as gr
 import requests
-from agenteval import (
-    process_eval_logs,
-    upload_folder_to_hf,
-)
-from agenteval.leaderboard.models import LeaderboardSubmission
 from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component
 from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
 from datasets.data_files import EmptyDatasetError
@@ -34,8 +27,6 @@ from config import (
     CONFIG_NAME,
     CONTACT_DATASET,
     EXTRACTED_DATA_DIR,
-    IS_INTERNAL,
-    LOCAL_DEBUG,
     RESULTS_DATASET,
     SUBMISSION_DATASET,
 )
@@ -90,6 +81,7 @@ def upload_submission(
     _validate_path_component(split, "split")
     _validate_path_component(submission_name, "submission_name")
     dataset_url = f"hf://datasets/{SUBMISSION_DATASET}/{CONFIG_NAME}/{split}/{submission_name}"
+    logger.info(f"Uploading dataset {dataset_url}")
     api.upload_folder(
         folder_path=folder_path,
         path_in_repo=f"{CONFIG_NAME}/{split}/{submission_name}",
@@ -111,7 +103,7 @@ def add_new_eval(
     agent_url: str,
     openness: str | None,
     degree_of_control: str | None,
-    path_to_file: tempfile._TemporaryFileWrapper | None,
+    path_to_file: typing.IO | None,
     username: str,
     role: str,
     email: str,
@@ -173,13 +165,13 @@ def add_new_eval(
     logger.debug(f"agent {agent_name}: Submission frequency check {profile.username}")
     contact_infos = try_load_dataset_submission(
         CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
-        verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
-    )
-    user_submission_dates = sorted(
-        datetime.fromisoformat(row["submit_time"])
-        for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
+        verification_mode=VerificationMode.NO_CHECKS
     )
-    if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
+    if _is_last_submission_too_recent(
+        contact_rows=contact_infos.get(val_or_test, []),
+        username=profile.username,
+        submission_time=submission_time,
+    ):
         logger.info(f"agent {agent_name}: Denied submission because user {username} submitted recently")
         return (
             format_error("You already submitted once in the last 24h for this split; please try again later."), # error_message
@@ -262,7 +254,7 @@ def add_new_eval(
 
     logger.info(f"agent {agent_name}: Upload raw (unscored) submission files")
     try:
-        upload_submission(extracted_dir, val_or_test, submission_name, profile.username)
+        dataset_url = upload_submission(extracted_dir, val_or_test, submission_name, profile.username)
     except ValueError as e:
         return (
             format_error(str(e)), # error_message
@@ -280,11 +272,11 @@ def add_new_eval(
 
     logger.info(f"agent {agent_name}: Save contact information")
     contact_info = subm_meta.model_dump()
-    contact_info["submit_time"] = submission_time.isoformat()
     contact_info["username_auth"] = profile.username
     contact_info["email"] = email
     contact_info["email_opt_in"] = email_opt_in
     contact_info["role"] = role
+    contact_info["dataset_url"] = dataset_url
 
     logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
     if val_or_test in contact_infos:
@@ -293,7 +285,11 @@ def add_new_eval(
     contact_infos[val_or_test] = Dataset.from_list([contact_info])
 
     try:
-        contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
+        contact_infos.push_to_hub(
+            repo_id=CONTACT_DATASET,
+            config_name=CONFIG_NAME,
+            commit_message=f'Submission from hf user "{profile.username}" to "{dataset_url}"',
+        )
     except Exception as e:
         return (
             format_error(f"Submission recorded, but contact info failed to save: {e}"), # error_message
@@ -304,7 +300,7 @@ def add_new_eval(
 
     logger.info(f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split.")
     return (
-        "", # error_message
+        "", # message
         gr.update(visible=False), # error_modal
         gr.update(visible=True), # success_modal
         gr.update(visible=False) # loading_modal
@@ -319,6 +315,14 @@ def _is_hf_acct_too_new(submission_time: datetime, username: str):
     return submission_time - created_at < timedelta(days=60)
 
 
+def _is_last_submission_too_recent(contact_rows, username, submission_time):
+    user_submission_dates = sorted(
+        datetime.fromisoformat(row["submit_time"])
+        for row in contact_rows if row["username_auth"] == username
+    )
+    return user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1))
+
+
 openness_label_html = f"""<div>
     <b>Agent Openness</b>
     {build_openness_tooltip_content()}
@@ -331,7 +335,6 @@ agent_tooling_label_html = f"""<div>
 </div>"""
 
 
-
 heading_html = """
 <h2>🚀 Submit an agent for evaluation</h2>
 <p>Submit your agent to AstaBench for evaluation on real-world scientific tasks. Once submitted, your run will be reviewed by our team. If there are any issues, we’ll reach out within 5–7 business days. We’re working toward full automation, but in the meantime, human review helps ensure quality and trust.</p>
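Reviewer note: extracting the rate-limit check into `_is_last_submission_too_recent` makes it easy to exercise in isolation (and to mock, as the new integration test below does). A minimal sketch of its behavior; the rows imitate the contact-dataset schema used above, and the timestamps are made up.

# Sketch only: illustrative rows in the contact-dataset shape used above.
from datetime import datetime, timezone

from submission import _is_last_submission_too_recent

now = datetime(2024, 1, 2, tzinfo=timezone.utc)
rows = [
    {"submit_time": "2024-01-01T12:00:00+00:00", "username_auth": "test_user"},
    {"submit_time": "2023-12-01T00:00:00+00:00", "username_auth": "someone_else"},
]

# test_user's last submission is 12h before `now`, inside the 24h window.
assert _is_last_submission_too_recent(rows, "test_user", now)
# someone_else's only submission is over a month old, so it passes.
assert not _is_last_submission_too_recent(rows, "someone_else", now)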
tests/integration/test-submission.tar.gz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34e9cba8a8431af3323b2dc8ce639f0dc058b4493c08eb65dbf20fe6936a27e8
+size 8187172
tests/integration/test_submission.py ADDED
@@ -0,0 +1,110 @@
+import json
+import os
+from datetime import datetime
+
+import gradio
+import pytest
+import pyarrow as pa
+from agenteval.models import SubmissionMetadata
+from datasets import load_dataset, VerificationMode
+from huggingface_hub import HfApi, hf_hub_download
+
+from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
+from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
+from submission import add_new_eval
+
+_hf = HfApi()
+
+
+class TestSubmission:
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        # These need to be set before imports are evaluated so all we can do here
+        # is check that they have been set correctly.
+        assert IS_INTERNAL == True
+        assert CONFIG_NAME == "continuous-integration"
+
+    def test_add_new_eval(self, mocker):
+        # Bypass some checks so that the test can cover later parts of the code.
+        mocker.patch("submission._is_hf_acct_too_new", return_value=False)
+        mocker.patch("submission._is_last_submission_too_recent", return_value=False)
+
+        # We use this to find records corresponding to this test.
+        agent_description = f"CI run at {datetime.now().isoformat()}"
+        print(f"Using unique agent description: {agent_description}")
+
+        print("Submitting test submission...")
+        with open(os.path.join(os.path.dirname(__file__), "test-submission.tar.gz"), "rb") as f:
+            result = add_new_eval(
+                val_or_test="test",
+                agent_name="TestSubmissionIntegration",
+                agent_description=agent_description,
+                agent_url="https://github.com/allenai/asta-bench-leaderboard/blob/main/tests/integration/test_submission.py",
+                openness=CANONICAL_OPENNESS_CLOSED_UI_ONLY,
+                degree_of_control=CANONICAL_TOOL_USAGE_STANDARD,
+                path_to_file=f,
+                username="test_user",
+                role="Other",
+                email="[email protected]",
+                email_opt_in=True,
+                profile=gradio.OAuthProfile({
+                    "name": "Test User",
+                    "preferred_username": "test_user",
+                    "profile": "test_user_profile",
+                    "picture": "https://placecats.com/150/150",
+                }),
+            )
+
+        message, error_modal, success_modal, loading_modal = result
+        assert message == ""  # Success
+        assert error_modal == {'__type__': 'update', 'visible': False}
+        assert success_modal == {'__type__': 'update', 'visible': True}
+        assert loading_modal == {'__type__': 'update', 'visible': False}
+
+        print("Looking up contact record...")
+        contacts = load_dataset(path=CONTACT_DATASET,
+                                name=CONFIG_NAME,
+                                download_mode="force_redownload",
+                                verification_mode=VerificationMode.NO_CHECKS)
+        # There should have been a new entry due to this test with our unique description.
+        found_contact = next(row for row in contacts['test'] if row['agent_description'] == agent_description)
+        assert found_contact
+
+        # This contains an attribute that should lead us to files in the submissions dataset.
+        dataset_url = found_contact['dataset_url']
+        print(f"Found dataset URL: {dataset_url}")
+        assert dataset_url.startswith(
+            "hf://datasets/allenai/asta-bench-internal-submissions/continuous-integration/test/")
+
+        print("Checking submission dataset...")
+        # Commit message itself should link this and the contact record together unambiguously.
+        recent_commits = _hf.list_repo_commits(repo_type="dataset", repo_id=SUBMISSION_DATASET)
+        assert any(dataset_url in c.title for c in recent_commits)
+
+        print("Checking that files are present...")
+        rel_path = dataset_url[len("hf://datasets/allenai/asta-bench-internal-submissions/"):]
+        ds_info = _hf.dataset_info(SUBMISSION_DATASET)
+        # These are the files in our test-submission.tar.gz
+        assert any(f"{rel_path}/eval_config.json" == f.rfilename for f in ds_info.siblings)
+        assert any(f"{rel_path}/task_sqa_solver_openscilm.eval" == f.rfilename for f in ds_info.siblings)
+        # This is the generated metadata put into the dataset itself.
+        assert any(f"{rel_path}/submission.json" == f.rfilename for f in ds_info.siblings)
+
+        print("Checking contact record against submission.json...")
+        # Checks on contact record which is stored in a private dataset.
+        local_path = hf_hub_download(repo_type="dataset",
+                                     repo_id=SUBMISSION_DATASET,
+                                     filename=f"{rel_path}/submission.json")
+        with open(local_path) as f:
+            contact_from_json = json.load(f)
+        # Assert that all keys and values in submission.json are present in the contact record
+        for key, value_from_json in contact_from_json.items():
+            value_from_dataset = found_contact[key]
+            if isinstance(value_from_dataset, datetime):
+                value_from_dataset = found_contact[key].isoformat().replace('+00:00', 'Z')
+            assert value_from_dataset == value_from_json
+        # submission.json should not contain sensitive PII, specifically, email.
+        assert 'email' in found_contact
+        assert 'email' not in contact_from_json
+        # submission.json is defined by a specific data model.
+        SubmissionMetadata.model_validate(contact_from_json)
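Reviewer note: the setup fixture's comment says HF_CONFIG and IS_INTERNAL must be in the environment before config is imported; because module-level assignments run at import time, setting them afterwards has no effect, which is why the workflow sets them on the pytest step. config.py is not part of this diff, so the following is only a hypothetical sketch of the plumbing this assumes (the names come from the test's imports).

# Hypothetical sketch of config.py's env handling; the real module is not in
# this commit. CONFIG_NAME and IS_INTERNAL are the names the test imports.
import os

CONFIG_NAME = os.environ.get("HF_CONFIG", "default")
IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"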