asta-bench-leaderboard

Running

App Files Community

Jason committed 17 days ago

Commit

50aa233

unverified ·

1 Parent(s): ba77e6d

Jason/inttest and contact record improvements for reviewer (#97)

Browse files

Files changed (7) hide show

.gitattributes +1 -0
.github/workflows/integration-tests.yml +44 -0
app.py +0 -1
requirements-dev.txt +2 -0
submission.py +25 -22
tests/integration/test-submission.tar.gz +3 -0
tests/integration/test_submission.py +110 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ tests/integration/test-submission.tar.gz filter=lfs diff=lfs merge=lfs -text

.github/workflows/integration-tests.yml ADDED Viewed

	@@ -0,0 +1,44 @@

+# Runs the integration test suite under tests/integration/ for every pull
+# request that targets main.
+name: Integration Tests
+on:
+  pull_request:
+    branches: [ main ]
+jobs:
+  integration-test:
+    runs-on: ubuntu-latest
+    environment:
+      name: testing
+    steps:
+    # The test fixture tests/integration/test-submission.tar.gz is tracked with
+    # Git LFS (see .gitattributes), so the real file must be fetched, not the pointer.
+    - uses: actions/checkout@v4
+      with:
+        lfs: true
+    # v5 runs on the Node 20 action runtime; v4 used the deprecated Node 16 runtime.
+    - name: Set up Python 3.11
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.11'
+    # v4 runs on the Node 20 action runtime; v3 used the deprecated Node 16 runtime.
+    - name: Cache pip dependencies
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/pip
+        # Cache key changes whenever any requirements*.txt file changes.
+        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
+        restore-keys: |
+          ${{ runner.os }}-pip-
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install -r requirements-dev.txt
+    - name: Run integration tests
+      run: |
+        pytest tests/integration/ -v --tb=short
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # The test suite asserts CONFIG_NAME == "continuous-integration" and
+        # IS_INTERNAL is true; presumably config.py derives both from these
+        # variables at import time -- confirm against config.py.
+        HF_CONFIG: continuous-integration
+        IS_INTERNAL: true

app.py CHANGED Viewed

@@ -237,7 +237,6 @@ def restart_space_job():
 if __name__ == "__main__":
     if LOCAL_DEBUG:
         print("Launching in LOCAL_DEBUG mode.")
-        def get_initial_global_tag_choices(): return ["Overall"]
         demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
     else:
         print("Launching in Space mode.")

 if __name__ == "__main__":
     if LOCAL_DEBUG:
         print("Launching in LOCAL_DEBUG mode.")
         demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico")
     else:
         print("Launching in Space mode.")

requirements-dev.txt CHANGED Viewed

@@ -1,2 +1,4 @@
 black
 isort

 black
 isort
+pytest~=8.4.1
+pytest-mock~=3.14.1

submission.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-import sys
 import matplotlib
 from agenteval.cli import SUBMISSION_METADATA_FILENAME
@@ -12,18 +12,11 @@ matplotlib.use('Agg')
 import os
 import shutil
 import tarfile
-import tempfile
 from datetime import datetime, timedelta, timezone
 from email.utils import parseaddr
-from pathlib import Path
 import gradio as gr
 import requests
-from agenteval import (
-    process_eval_logs,
-    upload_folder_to_hf,
-)
-from agenteval.leaderboard.models import LeaderboardSubmission
 from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component
 from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
 from datasets.data_files import EmptyDatasetError
@@ -34,8 +27,6 @@ from config import (
     CONFIG_NAME,
     CONTACT_DATASET,
     EXTRACTED_DATA_DIR,
-    IS_INTERNAL,
-    LOCAL_DEBUG,
     RESULTS_DATASET,
     SUBMISSION_DATASET,
 )
@@ -90,6 +81,7 @@ def upload_submission(
     _validate_path_component(split, "split")
     _validate_path_component(submission_name, "submission_name")
     dataset_url = f"hf://datasets/{SUBMISSION_DATASET}/{CONFIG_NAME}/{split}/{submission_name}"
     api.upload_folder(
         folder_path=folder_path,
         path_in_repo=f"{CONFIG_NAME}/{split}/{submission_name}",
@@ -111,7 +103,7 @@ def add_new_eval(
         agent_url: str,
         openness: str | None,
         degree_of_control: str | None,
-        path_to_file: tempfile._TemporaryFileWrapper | None,
         username: str,
         role: str,
         email: str,
@@ -173,13 +165,13 @@ def add_new_eval(
     logger.debug(f"agent {agent_name}: Submission frequency check {profile.username}")
     contact_infos = try_load_dataset_submission(
         CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
-        verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
-    )
-    user_submission_dates = sorted(
-        datetime.fromisoformat(row["submit_time"])
-        for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
     )
-    if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
         logger.info(f"agent {agent_name}: Denied submission because user {username} submitted recently")
         return (
             format_error("You already submitted once in the last 24h for this split; please try again later."),  # error_message
@@ -262,7 +254,7 @@ def add_new_eval(
     logger.info(f"agent {agent_name}: Upload raw (unscored) submission files")
     try:
-        upload_submission(extracted_dir, val_or_test, submission_name, profile.username)
     except ValueError as e:
         return (
             format_error(str(e)),                               # error_message
@@ -280,11 +272,11 @@ def add_new_eval(
     logger.info(f"agent {agent_name}: Save contact information")
     contact_info = subm_meta.model_dump()
-    contact_info["submit_time"] = submission_time.isoformat()
     contact_info["username_auth"] = profile.username
     contact_info["email"] = email
     contact_info["email_opt_in"] = email_opt_in
     contact_info["role"] = role
     logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
     if val_or_test in contact_infos:
@@ -293,7 +285,11 @@ def add_new_eval(
         contact_infos[val_or_test] = Dataset.from_list([contact_info])
     try:
-        contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
     except Exception as e:
         return (
             format_error(f"Submission recorded, but contact info failed to save: {e}"),  # error_message
@@ -304,7 +300,7 @@ def add_new_eval(
     logger.info(f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split.")
     return (
-        "",                                                 # error_message
         gr.update(visible=False),                           # error_modal
         gr.update(visible=True),                            # success_modal
         gr.update(visible=False)                            # loading_modal
@@ -319,6 +315,14 @@ def _is_hf_acct_too_new(submission_time: datetime, username: str):
     return submission_time - created_at < timedelta(days=60)
 openness_label_html = f"""<div>
     <b>Agent Openness</b>
     {build_openness_tooltip_content()}
@@ -331,7 +335,6 @@ agent_tooling_label_html = f"""<div>
 </div>"""
 heading_html = """
 <h2>🚀 Submit an agent for evaluation</h2>
 <p>Submit your agent to AstaBench for evaluation on real-world scientific tasks. Once submitted, your run will be reviewed by our team. If there are any issues, we’ll reach out within 5–7 business days. We’re working toward full automation, but in the meantime, human review helps ensure quality and trust.</p>

 import logging
+import typing
 import matplotlib
 from agenteval.cli import SUBMISSION_METADATA_FILENAME
 import os
 import shutil
 import tarfile
 from datetime import datetime, timedelta, timezone
 from email.utils import parseaddr
 import gradio as gr
 import requests
 from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component
 from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
 from datasets.data_files import EmptyDatasetError
     CONFIG_NAME,
     CONTACT_DATASET,
     EXTRACTED_DATA_DIR,
     RESULTS_DATASET,
     SUBMISSION_DATASET,
 )
     _validate_path_component(split, "split")
     _validate_path_component(submission_name, "submission_name")
     dataset_url = f"hf://datasets/{SUBMISSION_DATASET}/{CONFIG_NAME}/{split}/{submission_name}"
+    logger.info(f"Uploading dataset {dataset_url}")
     api.upload_folder(
         folder_path=folder_path,
         path_in_repo=f"{CONFIG_NAME}/{split}/{submission_name}",
         agent_url: str,
         openness: str | None,
         degree_of_control: str | None,
+        path_to_file: typing.IO | None,
         username: str,
         role: str,
         email: str,
     logger.debug(f"agent {agent_name}: Submission frequency check {profile.username}")
     contact_infos = try_load_dataset_submission(
         CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
+        verification_mode=VerificationMode.NO_CHECKS
     )
+    if _is_last_submission_too_recent(
+        contact_rows=contact_infos.get(val_or_test, []),
+        username=profile.username,
+        submission_time=submission_time,
+    ):
         logger.info(f"agent {agent_name}: Denied submission because user {username} submitted recently")
         return (
             format_error("You already submitted once in the last 24h for this split; please try again later."),  # error_message
     logger.info(f"agent {agent_name}: Upload raw (unscored) submission files")
     try:
+        dataset_url = upload_submission(extracted_dir, val_or_test, submission_name, profile.username)
     except ValueError as e:
         return (
             format_error(str(e)),                               # error_message
     logger.info(f"agent {agent_name}: Save contact information")
     contact_info = subm_meta.model_dump()
     contact_info["username_auth"] = profile.username
     contact_info["email"] = email
     contact_info["email_opt_in"] = email_opt_in
     contact_info["role"] = role
+    contact_info["dataset_url"] = dataset_url
     logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
     if val_or_test in contact_infos:
         contact_infos[val_or_test] = Dataset.from_list([contact_info])
     try:
+        contact_infos.push_to_hub(
+            repo_id=CONTACT_DATASET,
+            config_name=CONFIG_NAME,
+            commit_message=f'Submission from hf user "{profile.username}" to "{dataset_url}"',
+        )
     except Exception as e:
         return (
             format_error(f"Submission recorded, but contact info failed to save: {e}"),  # error_message
     logger.info(f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split.")
     return (
+        "",                                                 # message
         gr.update(visible=False),                           # error_modal
         gr.update(visible=True),                            # success_modal
         gr.update(visible=False)                            # loading_modal
     return submission_time - created_at < timedelta(days=60)
+def _is_last_submission_too_recent(contact_rows, username, submission_time):
+    user_submission_dates = sorted(
+        datetime.fromisoformat(row["submit_time"])
+        for row in contact_rows if row["username_auth"] == username
+    )
+    return user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1))
 openness_label_html = f"""<div>
     <b>Agent Openness</b>
     {build_openness_tooltip_content()}
 </div>"""
 heading_html = """
 <h2>🚀 Submit an agent for evaluation</h2>
 <p>Submit your agent to AstaBench for evaluation on real-world scientific tasks. Once submitted, your run will be reviewed by our team. If there are any issues, we’ll reach out within 5–7 business days. We’re working toward full automation, but in the meantime, human review helps ensure quality and trust.</p>

tests/integration/test-submission.tar.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34e9cba8a8431af3323b2dc8ce639f0dc058b4493c08eb65dbf20fe6936a27e8
+size 8187172

tests/integration/test_submission.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import json
+import os
+from datetime import datetime
+import gradio
+import pytest
+import pyarrow as pa
+from agenteval.models import SubmissionMetadata
+from datasets import load_dataset, VerificationMode
+from huggingface_hub import HfApi, hf_hub_download
+from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
+from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
+from submission import add_new_eval
+_hf = HfApi()
+class TestSubmission:
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        # These need to be set before imports are evaluated so all we can do here
+        # is check that they have been set correctly.
+        assert IS_INTERNAL == True
+        assert CONFIG_NAME == "continuous-integration"
+    def test_add_new_eval(self, mocker):
+        # Bypass some checks so that the test can cover later parts of the code.
+        mocker.patch("submission._is_hf_acct_too_new", return_value=False)
+        mocker.patch("submission._is_last_submission_too_recent", return_value=False)
+        # We use this to find records corresponding to this test.
+        agent_description = f"CI run at {datetime.now().isoformat()}"
+        print(f"Using unique agent description: {agent_description}")
+        print("Submitting test submission...")
+        with open(os.path.join(os.path.dirname(__file__), "test-submission.tar.gz"), "rb") as f:
+            result = add_new_eval(
+                val_or_test="test",
+                agent_name="TestSubmissionIntegration",
+                agent_description=agent_description,
+                agent_url="https://github.com/allenai/asta-bench-leaderboard/blob/main/tests/integration/test_submission.py",
+                openness=CANONICAL_OPENNESS_CLOSED_UI_ONLY,
+                degree_of_control=CANONICAL_TOOL_USAGE_STANDARD,
+                path_to_file=f,
+                username="test_user",
+                role="Other",
+                email="[email protected]",
+                email_opt_in=True,
+                profile=gradio.OAuthProfile({
+                    "name": "Test User",
+                    "preferred_username": "test_user",
+                    "profile": "test_user_profile",
+                    "picture": "https://placecats.com/150/150",
+                }),
+            )
+        message, error_modal, success_modal, loading_modal = result
+        assert message == ""  # Success
+        assert error_modal == {'__type__': 'update', 'visible': False}
+        assert success_modal == {'__type__': 'update', 'visible': True}
+        assert loading_modal == {'__type__': 'update', 'visible': False}
+        print("Looking up contact record...")
+        contacts = load_dataset(path=CONTACT_DATASET,
+                                name=CONFIG_NAME,
+                                download_mode="force_redownload",
+                                verification_mode=VerificationMode.NO_CHECKS)
+        # There should have been a new entry due to this test with our unique description.
+        found_contact = next(row for row in contacts['test'] if row['agent_description'] == agent_description)
+        assert found_contact
+        # This contains an attribute that should lead us to files in the submissions dataset.
+        dataset_url = found_contact['dataset_url']
+        print(f"Found dataset URL: {dataset_url}")
+        assert dataset_url.startswith(
+            "hf://datasets/allenai/asta-bench-internal-submissions/continuous-integration/test/")
+        print("Checking submission dataset...")
+        # Commit message itself should link this and the contact record together unambiguously.
+        recent_commits = _hf.list_repo_commits(repo_type="dataset", repo_id=SUBMISSION_DATASET)
+        assert any(dataset_url in c.title for c in recent_commits)
+        print("Checking that files are present...")
+        rel_path = dataset_url[len("hf://datasets/allenai/asta-bench-internal-submissions/"):]
+        ds_info = _hf.dataset_info(SUBMISSION_DATASET)
+        # These are the files in our test-submission.tar.gz
+        assert any(f"{rel_path}/eval_config.json" == f.rfilename for f in ds_info.siblings)
+        assert any(f"{rel_path}/task_sqa_solver_openscilm.eval" == f.rfilename for f in ds_info.siblings)
+        # This is the generated metadata put into the dataset itself.
+        assert any(f"{rel_path}/submission.json" == f.rfilename for f in ds_info.siblings)
+        print("Checking contact record against submission.json...")
+        # Checks on contact record which is stored in a private dataset.
+        local_path = hf_hub_download(repo_type="dataset",
+                                     repo_id=SUBMISSION_DATASET,
+                                     filename=f"{rel_path}/submission.json")
+        with open(local_path) as f:
+            contact_from_json = json.load(f)
+        # Assert that all keys and values in submission.json are present in the contact record
+        for key, value_from_json in contact_from_json.items():
+            value_from_dataset = found_contact[key]
+            if isinstance(value_from_dataset, datetime):
+                value_from_dataset = found_contact[key].isoformat().replace('+00:00', 'Z')
+            assert value_from_dataset == value_from_json
+        # submission.json should not contain sensitive PII, specifically, email.
+        assert 'email' in found_contact
+        assert 'email' not in contact_from_json
+        # submission.json is defined by a specific data model.
+        SubmissionMetadata.model_validate(contact_from_json)