jbragg committed on
Commit 5ef407f · unverified · 1 Parent(s): f82c430

Leaderboard (#2)

Files changed (7)
  1. .gitignore +181 -0
  2. Dockerfile +44 -0
  3. README.md +15 -0
  4. app.py +644 -0
  5. content.py +55 -0
  6. requirements-dev.txt +2 -0
  7. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,181 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Vim files
+ *.swp
+ *.swo
+ *.un~
+
+ # Misc
+ .DS_Store
+ .mise.toml
+ .vscode/
Dockerfile ADDED
@@ -0,0 +1,44 @@
+ FROM python:3.10-slim
+
+
+ # (0) Install SSH client tools (and git, if you're pulling via SSH)
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends openssh-client git && \
+     rm -rf /var/lib/apt/lists/*
+
+ # The two following lines are requirements for the Dev Mode to be functional
+ # Learn more about the Dev Mode at https://huggingface.co/dev-mode-explorers
+ RUN useradd -m -u 1000 user
+ WORKDIR /app
+
+
+ # (2) Copy dependencies manifest
+ COPY --chown=user requirements.txt requirements.txt
+
+ # (3) Install dependencies, mounting SSH keys and optional HTTPS creds
+ RUN --mount=type=secret,id=AGENTEVAL_DEPLOY_KEY,mode=0400,required=true \
+     --mount=type=secret,id=ASTABENCH_DEPLOY_KEY,mode=0400,required=true \
+     mkdir -p /root/.ssh && chmod 700 /root/.ssh && \
+     cat /run/secrets/AGENTEVAL_DEPLOY_KEY > /root/.ssh/id_ed25519 && chmod 600 /root/.ssh/id_ed25519 && \
+     cat /run/secrets/ASTABENCH_DEPLOY_KEY > /root/.ssh/id_astabench && chmod 600 /root/.ssh/id_astabench && \
+     ssh-keyscan github.com >> /root/.ssh/known_hosts && \
+     printf 'Host github.com\n User git\n IdentityFile /root/.ssh/id_ed25519\n IdentityFile /root/.ssh/id_astabench\n StrictHostKeyChecking no\n' >> /root/.ssh/config && \
+     # rewrite all GitHub HTTPS URLs to SSH so nested deps install via SSH
+     git config --global url."ssh://git@github.com/".insteadOf "https://github.com/" && \
+     pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # (4) Copy in your Gradio app code
+ COPY . .
+ RUN mkdir -p /home/user/data && chown -R user:user /home/user/data
+
+ # Make the app treat this as non-debug (so DATA_DIR=/home/user/data)
+ ENV system=spaces
+
+ # (5) Switch to a non-root user
+ USER user
+
+ # (6) Expose Gradio's default port
+ EXPOSE 7860
+
+ # (7) Launch your app
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -0,0 +1,15 @@
+ ---
+ title: AstaBench Leaderboard
+ emoji: 🥇
+ colorFrom: green
+ colorTo: indigo
+ sdk: docker
+ app_file: app.py
+ pinned: true
+ license: apache-2.0
+ hf_oauth: true
+ app_port: 7860
+ failure_strategy: none
+ tags:
+   - leaderboard
+ ---
app.py ADDED
@@ -0,0 +1,644 @@
+ """app.py: Gradio app for the AstaBench leaderboard.
+
+ Modeled after the GAIA huggingface leaderboard app.
+
+ """
+
+ import json
+ import os
+ import shutil
+ import tarfile
+ import tempfile
+ from datetime import datetime, timedelta, timezone
+ from email.utils import parseaddr
+ from pathlib import Path
+ from zoneinfo import ZoneInfo
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import requests
+ from agenteval import (
+     compute_summary_statistics,
+     process_eval_logs,
+     upload_folder_to_hf,
+     upload_summary_to_hf,
+ )
+ from agenteval.models import EvalResult
+ from agenteval.upload import sanitize_path_component
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
+ from datasets.data_files import EmptyDatasetError
+ from huggingface_hub import HfApi
+
+ from content import (
+     CITATION_BUTTON_LABEL,
+     CITATION_BUTTON_TEXT,
+     INTRODUCTION_TEXT,
+     SUBMISSION_TEXT,
+     TITLE,
+     format_error,
+     format_log,
+     format_warning,
+     hf_uri_to_web_url,
+     hyperlink,
+ )
+
+ # Should be False on spaces and True outside
+ LOCAL_DEBUG = not (os.environ.get("system") == "spaces")
+
+
+ CONFIG_NAME = "1.0.0-dev1"
+
+ IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true"
+
+ OWNER = "allenai"
+ PROJECT_NAME = "asta-bench" + ("-internal" if IS_INTERNAL else "")
+ SUBMISSION_DATASET = f"{OWNER}/{PROJECT_NAME}-submissions" # all raw and scored submissions (val and test)
+ SUBMISSION_DATASET_PUBLIC = f"{OWNER}/{PROJECT_NAME}-submissions-public" # copy scored val submissions (public for transparency - not used for rendering leaderboard)
+ CONTACT_DATASET = f"{OWNER}/{PROJECT_NAME}-contact-info"
+ RESULTS_DATASET = f"{OWNER}/{PROJECT_NAME}-results" # just the summary score statistics (val and test), to be displayed on the leaderboard
+ LEADERBOARD_PATH = f"{OWNER}/{PROJECT_NAME}-leaderboard"
+
+ if LOCAL_DEBUG:
+     DATA_DIR = os.path.join(os.path.dirname(__file__), "data", CONFIG_NAME)
+ else:
+     DATA_DIR = "/home/user/data/" + CONFIG_NAME
+ EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted")
+
+ api = HfApi()
+
+ # max upload size of 100MB
+ MAX_UPLOAD_BYTES = 100 * 1024**2
+
+ AGENTEVAL_MANIFEST_NAME = "agenteval.json"
+
+
+ os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
+
+
+ def try_load_dataset(*args, **kwargs) -> DatasetDict:
+     try:
+         return load_dataset(*args, **kwargs)
+     except EmptyDatasetError:
+         return DatasetDict()
+     except ValueError:
+         return DatasetDict()
+
+
+ def pretty_column_name(col: str) -> str:
+     """Map any raw column name to its display name."""
+     # text columns
+     if col == "submit_time":
+         return "Submission date"
+     elif col == "agent_name":
+         return "Agent"
+     elif col == "agent_description":
+         return "Agent description"
+     elif col == "username":
+         return "User/organization"
+     elif col == "logs_url":
+         return "Logs"
+     # cost → $
+     if col.endswith("/cost"):
+         return "$"
+     # stderr → CI
+     elif col.endswith("/cost_stderr") or col.endswith("/score_stderr"):
+         return "CI"
+     # overall score
+     elif col == "overall/score":
+         return "Overall"
+     # any other score → its tag/task name
+     elif col.endswith("/score"):
+         return col.split("/")[1]
+     # fallback to unchanged
+     return col
+
+
+ def get_dataframe_from_results(eval_results: DatasetDict, split: str):
+     local_df = eval_results.get(split)
+     # return default if split is missing or contains no records
+     if local_df is None or len(local_df) == 0:
+         default_raw_cols = [
+             "agent_name",
+             "agent_description",
+             "username",
+             "submit_time",
+         ]
+         pretty_cols = [pretty_column_name(c) for c in default_raw_cols]
+         return pd.DataFrame({col: ["No data"] for col in pretty_cols})
+
+     # Use the first suite_config for all rows
+     # because the suite_config should not change given a single CONFIG_NAME
+     first_suite_config = None
+     if len(local_df) > 0:
+         first_suite_config = EvalResult.model_validate(local_df[0]).suite_config
+
+     def extract_scores(eval_res: EvalResult) -> dict[str, float | None]:
+         summary_stats = compute_summary_statistics(
+             suite_config=first_suite_config,
+             split=split,
+             results=eval_res.results,
+         )
+
+         values: dict[str, float | None] = {}
+         for key in summary_stats:
+             if key == "overall":
+                 values["overall/score"] = summary_stats[key].score
+                 values["overall/cost"] = summary_stats[key].cost
+             elif key.startswith("tag/"):
+                 tag = key.split("/")[1]
+                 values[f"tag/{tag}/score"] = summary_stats[key].score
+                 values[f"tag/{tag}/cost"] = summary_stats[key].cost
+             elif key.startswith("task/"):
+                 task = key.split("/")[1]
+                 values[f"task/{task}/score"] = summary_stats[key].score
+                 values[f"task/{task}/score_stderr"] = summary_stats[key].score_stderr
+                 values[f"task/{task}/cost"] = summary_stats[key].cost
+                 values[f"task/{task}/cost_stderr"] = summary_stats[key].cost_stderr
+         return values
+
+     def format_row(row) -> dict[str, float | str | None]:
+         eval_res = EvalResult.model_validate(row)
+         sub = eval_res.submission
+         sub.submit_time = sub.submit_time or datetime(1970, 1, 1, 0, 0, 0)
+         data = {
+             "submit_time": sub.submit_time.astimezone(ZoneInfo("US/Pacific")).strftime(
+                 "%Y-%m-%d"
+             ),
+             "agent_name": (
+                 hyperlink(sub.agent_url, sub.agent_name)
+                 if sub.agent_url
+                 else sub.agent_name
+             ),
+             "agent_description": sub.agent_description or "",
+             "username": sub.username or "",
+             **extract_scores(eval_res),
+             "logs_url": (
+                 hyperlink(
+                     hf_uri_to_web_url(
+                         sub.logs_url if IS_INTERNAL else sub.logs_url_public
+                     ),
+                     "🔗",
+                 )
+                 if (sub.logs_url or sub.logs_url_public)
+                 else ""
+             ),
+         }
+         return data
+
+     local_df = local_df.map(format_row)
+
+     df = pd.DataFrame(local_df)
+
+     # Multiply score, cost, and stderr values by 100 and round to 1 decimal
+     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+     df[numeric_cols] = df[numeric_cols].multiply(100).round(1)
+
+     # Build column order on raw names, then rename via pretty_column_name
+     all_cols = df.columns.tolist()
+     base = ["agent_name", "agent_description", "username"]
+     overall = ["overall/score", "overall/cost"]
+     tags = sorted({c.split("/")[1] for c in all_cols if c.startswith("tag/")})
+     tasks = sorted({c.split("/")[1] for c in all_cols if c.startswith("task/")})
+     rest = ["submit_time", "logs_url"]
+     column_order = (
+         base
+         + overall
+         + [col for tag in tags for col in (f"tag/{tag}/score", f"tag/{tag}/cost")]
+         + [
+             col
+             for t in tasks
+             for col in (
+                 f"task/{t}/score",
+                 f"task/{t}/score_stderr",
+                 f"task/{t}/cost",
+                 f"task/{t}/cost_stderr",
+             )
+         ]
+         + rest
+     )
+     df = df.reindex(columns=[c for c in column_order if c in all_cols])
+     # sort by overall score (descending)
+     df = df.sort_values(by=["overall/score"], ascending=False)
+     # apply all renames via pretty_column_name
+     orig_cols = df.columns.tolist()
+     df.columns = [pretty_column_name(col) for col in orig_cols]
+
+     # blank out any null/NaN cells
+     df = df.fillna("")
+
+     return df
+
+
+ def load_and_format_dataframes():
+     eval_results = try_load_dataset(
+         RESULTS_DATASET,
+         CONFIG_NAME,
+         download_mode="force_redownload",
+         verification_mode=VerificationMode.NO_CHECKS,
+         trust_remote_code=True,
+     )
+     eval_dataframe_val = get_dataframe_from_results(
+         eval_results=eval_results, split="validation"
+     )
+     eval_dataframe_test = get_dataframe_from_results(
+         eval_results=eval_results, split="test"
+     )
+     return eval_results, eval_dataframe_val, eval_dataframe_test
+
+
+ # Display the results
+ eval_results, eval_dataframe_val, eval_dataframe_test = load_and_format_dataframes()
+
+
+ def restart_space():
+     api.restart_space(repo_id=LEADERBOARD_PATH)
+
+
+ def checked_upload_folder(
+     api,
+     folder_path: str,
+     repo_id: str,
+     config_name: str,
+     split: str,
+     submission_name: str,
+ ) -> str:
+     """Upload with inline size check; raises ValueError if too large."""
+     total = 0
+     for root, _, files in os.walk(folder_path):
+         for f in files:
+             total += os.path.getsize(os.path.join(root, f))
+     if total > MAX_UPLOAD_BYTES:
+         raise ValueError(
+             f"Upload too large: exceeds {MAX_UPLOAD_BYTES // (1024**2)} MB limit."
+         )
+     # NOTE: This function raises ValueError if unsafe characters are found in the path.
+     return upload_folder_to_hf(
+         api=api,
+         folder_path=folder_path,
+         repo_id=repo_id,
+         config_name=config_name,
+         split=split,
+         submission_name=submission_name,
+     )
+
+
+ def add_new_eval(
+     val_or_test: str,
+     agent_name: str | None,
+     agent_description: str,
+     agent_url: str,
+     path_to_file: tempfile._TemporaryFileWrapper | None,
+     username: str,
+     mail: str,
+     profile: gr.OAuthProfile,
+ ):
+     # default username if none provided
+     if not username or username.strip() == "":
+         username = profile.username
+
+     if not agent_name:
+         return format_warning("Please provide an agent name.")
+
+     submission_time = datetime.now(timezone.utc)
+
+     # Was the profile created less than 2 months ago?
+     user_data = requests.get(
+         f"https://huggingface.co/api/users/{profile.username}/overview"
+     )
+     creation_date = json.loads(user_data.content)["createdAt"]
+
+     created_at = datetime.strptime(creation_date, "%Y-%m-%dT%H:%M:%S.%fZ").replace(
+         tzinfo=timezone.utc
+     )
+     if submission_time - created_at < timedelta(days=60):
+         return format_error("This account is not authorized to submit here.")
+
+     contact_infos = try_load_dataset(
+         CONTACT_DATASET,
+         CONFIG_NAME,
+         download_mode="force_redownload",
+         verification_mode=VerificationMode.NO_CHECKS,
+         trust_remote_code=True,
+     )
+     user_submission_dates = sorted(
+         datetime.fromisoformat(row["submit_time"])
+         for row in contact_infos.get(val_or_test, [])
+         if row["username_auth"] == profile.username
+     )
+     if len(user_submission_dates) > 0 and abs(
+         submission_time - user_submission_dates[-1]
+     ) < timedelta(seconds=24 * 60 * 60):
+         return format_error(
+             "You already submitted once in the last 24h; please try again later."
+         )
+
+     is_validation = val_or_test == "validation"
+
+     # Very basic email parsing
+     _, parsed_mail = parseaddr(mail)
+     if "@" not in parsed_mail:
+         return format_warning("Please provide a valid email address.")
+
+     # Check duplicate submissions by inspecting the nested "submission" dicts
+     if val_or_test in eval_results and len(eval_results[val_or_test]) > 0:
+         existing = eval_results[val_or_test]
+         subs = existing.to_dict().get("submission", [])
+         names = {item.get("agent_name", "").lower() for item in subs}
+         users = {item.get("username", "").lower() for item in subs}
+         if agent_name.lower() in names and username.lower() in users:
+             return format_warning("This agent has already been submitted.")
+
+     if path_to_file is None:
+         return format_warning("Please attach a file.")
+
+     # sanitize username and agent_name for filesystem
+     safe_username = sanitize_path_component(username)
+     safe_agent_name = sanitize_path_component(agent_name)
+
+     extracted_dir = os.path.join(
+         EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}"
+     )
+
+     if LOCAL_DEBUG:
+         print("mock extracted file", flush=True)
+     else:
+         try:
+             # 1) remove old extraction if present
+             if os.path.exists(extracted_dir):
+                 shutil.rmtree(extracted_dir)
+             os.makedirs(extracted_dir, exist_ok=True)
+
+             # 2) securely extract only regular files, flatten structure
+             # Flatten structure to aid finding the manifest agenteval.json file
+             # and because hierarchical structure is not needed
+             with tarfile.open(path_to_file.name, "r:gz") as tar:
+                 for member in tar.getmembers():
+                     if not member.isreg():
+                         continue
+                     fname = os.path.basename(member.name)
+                     # skip empty or hidden
+                     if not fname or fname.startswith("."):
+                         continue
+                     fobj = tar.extractfile(member)
+                     if not fobj:
+                         continue
+                     target = os.path.join(extracted_dir, fname)
+                     with open(target, "wb") as out:
+                         out.write(fobj.read())
+
+             # 3) ensure something was extracted
+             if not os.listdir(extracted_dir):
+                 return format_error("Submission tarball is empty or invalid.")
+
+         except Exception as e:
+             return format_error(
+                 f"Error while extracting the file: {e}. Be sure to upload a valid .tar.gz file."
+             )
+
+     submission_name = (
+         f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d')}"
+     )
+
+     # SAVE UNSCORED SUBMISSION
+     if LOCAL_DEBUG:
+         print("mock uploaded submission", flush=True)
+     else:
+         try:
+             checked_upload_folder(
+                 api=api,
+                 folder_path=extracted_dir,
+                 repo_id=SUBMISSION_DATASET,
+                 config_name=CONFIG_NAME,
+                 split=val_or_test,
+                 submission_name=submission_name,
+             )
+         except ValueError as e:
+             return format_error(str(e))
+
+     # SAVE CONTACT
+     contact_info = {
+         "agent_name": agent_name,
+         "agent_description": agent_description,
+         "url": agent_url,
+         "username": username,
+         "username_auth": profile.username,
+         "mail": mail,
+         "submit_time": submission_time.isoformat(),
+     }
+     # add or init contact dataset for this split
+     if val_or_test in contact_infos:
+         contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
+     else:
+         contact_infos[val_or_test] = Dataset.from_list([contact_info])
+     if LOCAL_DEBUG:
+         print("mock uploaded contact info", flush=True)
+     else:
+         contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
+
+     try:
+         json_path = Path(extracted_dir) / AGENTEVAL_MANIFEST_NAME
+         if not json_path.exists():
+             return format_error(f"Missing manifest {AGENTEVAL_MANIFEST_NAME}")
+         raw = json_path.read_text(encoding="utf-8")
+         eval_result = EvalResult.model_validate_json(raw)
+         if eval_result.suite_config.version != CONFIG_NAME:
+             return format_error(
+                 f"Error: submitted suite version {eval_result.suite_config.version} "
+                 f"does not match currently accepted version {CONFIG_NAME}"
+             )
+         if eval_result.split != val_or_test:
+             return format_error(
+                 f"Error: uploaded split {eval_result.split} does not match selected split {val_or_test}"
+             )
+
+         # NOTE: Trusting user-computed scores, but re-computing the derived results based on the log files
+         eval_result.results = process_eval_logs(extracted_dir)[0]
+         eval_result.save_json(str(json_path))
+
+     except Exception as e:
+         return format_error(
+             f"Error while scoring the submission: {e}. Be sure to upload a valid submission."
+         )
+
+     # SAVE SCORED SUBMISSION
+     if LOCAL_DEBUG:
+         print("mock uploaded scored submission")
+     else:
+         try:
+             logs_url_private = checked_upload_folder(
+                 api=api,
+                 folder_path=extracted_dir,
+                 repo_id=SUBMISSION_DATASET,
+                 config_name=CONFIG_NAME,
+                 split=val_or_test,
+                 submission_name=f"{submission_name}_scored",
+             )
+         except ValueError as e:
+             return format_error(str(e))
+
+         # Validation submissions are public for public leaderboard
+         if is_validation and not IS_INTERNAL:
+             try:
+                 logs_url_public = checked_upload_folder(
+                     api=api,
+                     folder_path=extracted_dir,
+                     repo_id=SUBMISSION_DATASET_PUBLIC,
+                     config_name=CONFIG_NAME,
+                     split=val_or_test,
+                     submission_name=f"{submission_name}_scored",
+                 )
+             except ValueError as e:
+                 return format_error(str(e))
+         else:
+             logs_url_public = None
+
+     eval_result.submission.agent_name = agent_name
+     eval_result.submission.agent_description = agent_description
+     eval_result.submission.agent_url = agent_url
+     eval_result.submission.username = username
+     eval_result.submission.submit_time = submission_time
+     eval_result.submission.logs_url = logs_url_private
+     eval_result.submission.logs_url_public = logs_url_public
+
+     if LOCAL_DEBUG:
+         print("mock uploaded results to lb")
+     else:
+         upload_summary_to_hf(
+             api=api,
+             eval_result=eval_result,
+             repo_id=RESULTS_DATASET,
+             config_name=CONFIG_NAME,
+             split=val_or_test,
+             submission_name=f"{submission_name}_scored",
+         )
+
+     return format_log(
+         f"Agent {agent_name} submitted by {username} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed."
+     )
+
+
+ def refresh():
+     _, eval_dataframe_val, eval_dataframe_test = load_and_format_dataframes()
+     return eval_dataframe_val, eval_dataframe_test
+
+
+ # Determine column types dynamically based on dataframe columns
+ def compute_column_types(df):
+     col_types = []
+     for col in df.columns:
+         if col == "Agent":
+             col_types.append("markdown")
+         elif col in ["Agent description", "User/organization", "Submission date"]:
+             col_types.append("str")
+         elif col == "Logs":
+             col_types.append("markdown")
+         else:
+             col_types.append("number")
+     return col_types
+
+
+ test_col_types = compute_column_types(eval_dataframe_test)
+ val_col_types = compute_column_types(eval_dataframe_val)
+
+ demo = gr.Blocks()
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id="citation-button",
+             ) # .style(show_copy_button=True)
+
+     leaderboard_table_test = gr.Dataframe(
+         value=eval_dataframe_test,
+         headers=list(eval_dataframe_test.columns),
+         datatype=test_col_types,
+         interactive=False,
+         column_widths=["20%"],
+         render=False,
+     )
+
+     leaderboard_table_val = gr.Dataframe(
+         value=eval_dataframe_val,
+         headers=list(eval_dataframe_val.columns),
+         datatype=val_col_types,
+         interactive=False,
+         column_widths=["20%"],
+         render=False,
+     )
+
+     # Build tab layout list based on desired order
+     tabs = [
+         ("Results: Test", leaderboard_table_test),
+         ("Results: Validation", leaderboard_table_val),
+     ]
+
+     if IS_INTERNAL:
+         tabs = [tabs[1], tabs[0]] # Validation first for internal users
+
+     # Render the tabs in desired order
+     for label, component in tabs:
+         with gr.Tab(label):
+             component.render()
+
+     refresh_button = gr.Button("Refresh")
+     refresh_button.click(
+         refresh,
+         inputs=[],
+         outputs=[
+             leaderboard_table_val,
+             leaderboard_table_test,
+         ],
+     )
+     with gr.Accordion("Submit a new agent for evaluation"):
+         with gr.Row():
+             gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
+         with gr.Row():
+             with gr.Column():
+                 level_of_test = gr.Radio(
+                     ["validation", "test"], value="validation", label="Split"
+                 )
+                 agent_name_textbox = gr.Textbox(label="Agent name")
+                 agent_description_textbox = gr.Textbox(label="Agent description")
+                 agent_url_textbox = gr.Textbox(label="URL to agent information")
+             with gr.Column():
+                 username = gr.Textbox(
+                     label="Organization or user name (defaults to your HF username)",
+                     placeholder="Leave blank to use your HF username",
+                 )
+                 mail = gr.Textbox(
+                     label="Contact email (will be stored privately, & used if there is an issue with your submission)"
+                 )
+                 file_output = gr.File()
+
+         with gr.Row():
+             gr.LoginButton()
+             submit_button = gr.Button("Submit Eval")
+         submission_result = gr.Markdown()
+         submit_button.click(
+             add_new_eval,
+             [
+                 level_of_test,
+                 agent_name_textbox,
+                 agent_description_textbox,
+                 agent_url_textbox,
+                 file_output,
+                 username,
+                 mail,
+             ],
+             submission_result,
+         )
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=3600)
+ scheduler.start()
+ if LOCAL_DEBUG:
+     demo.launch(debug=True)
+ else:
+     demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
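
Editor's note: for development outside the Docker image (not covered by this commit), one way to run the app is to leave the `system` environment variable unset, so LOCAL_DEBUG stays true, data is written under ./data/1.0.0-dev1, and uploads are mocked. A rough sketch, assuming access to the private agent-eval repository and the result datasets:

  # install dependencies (agent-eval is pulled from GitHub per requirements.txt)
  pip install -r requirements.txt
  # run without system=spaces so LOCAL_DEBUG stays true and uploads are mocked
  python app.py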
content.py ADDED
@@ -0,0 +1,55 @@
+ TITLE = """<h1 align="center" id="space-title">AstaBench Leaderboard</h1>"""
+
+ INTRODUCTION_TEXT = """
+ ## Introduction
+ """
+
+ SUBMISSION_TEXT = """
+ ## Submissions
+ """
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@article{asta-bench,
+ title={AstaBench},
+ author={AstaBench folks},
+ year={2025},
+ eprint={TBD.TBD},
+ archivePrefix={arXiv},
+ primaryClass={cs.AI},
+ secondaryClass={cs.CL}
+ }"""
+
+
+ def format_error(msg):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def format_warning(msg):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def format_log(msg):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+
+ def hyperlink(link, text):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{text}</a>'
+
+
+ def hf_uri_to_web_url(uri: str) -> str:
+     """
+     Convert a Hugging Face-style URI like:
+         hf://datasets/{namespace}/{repo}/{path...}
+     into a public web URL:
+         https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path...}
+     """
+     prefix = "hf://datasets/"
+     if not uri.startswith(prefix):
+         raise ValueError("URI must start with 'hf://datasets/'")
+
+     parts = uri[len(prefix) :].split("/", 2)
+     if len(parts) < 3:
+         raise ValueError("Expected format: hf://datasets/{namespace}/{repo}/{path...}")
+
+     namespace, repo, path = parts
+     return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}"
requirements-dev.txt ADDED
@@ -0,0 +1,2 @@
+ black
+ isort
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ datasets
+ gradio[oauth]
+ huggingface-hub
+ APScheduler
+ agent-eval @ git+https://github.com/allenai/agent-eval.git@d302cb5d0ba983ae5f0764c53fde4e017118d0df#egg=agent-eval