import datetime
import json
import re
from email.utils import parseaddr

import gradio as gr
import pandas as pd
from datasets import load_dataset
from huggingface_hub import HfApi

from dabstep_benchmark.utils import format_log, format_error, format_warning, is_valid_https_url, evaluate

OWNER = "adyen"

HF_API = HfApi()
HF_LEADERBOARD = f"{OWNER}/DABstep"
HF_DATASET_PATH = f"{OWNER}/DABstep"
HF_INTERNAL_DATASET_PATH = f"{OWNER}/DABstep-internal"
HF_DATASET_CONFIGS = [
    "tasks",
    "submissions",
    "task_scores",
]
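
# In-memory cache of the datasets downloaded from the Hub, keyed by config name.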
DATASETS = {}


def refresh(only_leaderboard: bool = False):
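    """Download the benchmark datasets from the Hub and rebuild the leaderboard.

    With only_leaderboard=True, only the configs needed to render the
    leaderboard are fetched; otherwise the internal ground-truth and
    contact-info datasets are fetched as well.
    """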
    config_names = ["task_scores", "submissions"] if only_leaderboard else HF_DATASET_CONFIGS
    for config_name in config_names:
        DATASETS[config_name] = load_dataset(
            path=HF_DATASET_PATH,
            name=config_name,
            split="default",
        )
        print(f"Downloaded {HF_DATASET_PATH}/{config_name}")

    if not only_leaderboard:
        DATASETS["internal_tasks"] = load_dataset(
            path=HF_INTERNAL_DATASET_PATH,
            name="tasks",
            split="default",
        )
        print(f"Downloaded {HF_INTERNAL_DATASET_PATH}/tasks")
        DATASETS["contact_info"] = load_dataset(
            path=HF_INTERNAL_DATASET_PATH,
            name="contact_info",
            split="default",
        )
        print(f"Downloaded {HF_INTERNAL_DATASET_PATH}/contact_info")

    return generate_leaderboard_df()


def validate_submission(submission_df: pd.DataFrame):
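    """Check the structure of an uploaded submission DataFrame.

    Returns a formatted error message for the first problem found, or None
    when the submission is well-formed.
    """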
    mandatory_columns = ["agent_answer", "task_id"]
    expected_columns = [*mandatory_columns, "reasoning_trace"]

    # Every mandatory column must be present.
    missing_columns = [col for col in mandatory_columns if col not in submission_df.columns]
    if missing_columns:
        return format_error(f"Missing mandatory columns: {', '.join(missing_columns)}")

    # No columns beyond the expected set are allowed.
    unexpected_columns = [col for col in submission_df.columns if col not in expected_columns]
    if unexpected_columns:
        return format_error(f"Unexpected columns: {', '.join(unexpected_columns)}")

    # Reject submissions with missing values.
    if submission_df.isnull().values.any():
        return format_error("Submission contains NaN values. Please ensure there is no missing data.")

    # The file is read with dtype=str, so every column should have object dtype.
    non_string_columns = [col for col in submission_df.columns if submission_df[col].dtype != "object"]
    if non_string_columns:
        return format_error(f"Columns with non-string data type: {', '.join(non_string_columns)}")

    return None


def process_submission(
    split: str,
    agent_name: str,
    model_family: str,
    repo_url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
):
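    """Validate a submission, score it against the ground truth, and push results to the Hub.

    Returns a formatted log, warning, or error string for Gradio to display.
    """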
    # All free-text form fields are required.
    if agent_name == "":
        return format_warning("Please provide an agent name")
    if organisation == "":
        return format_warning("Please provide an organisation")
    if mail == "":
        return format_warning("Please provide an email")
    if model_family == "":
        return format_warning("Please provide a model family")

    # Names end up in submission ids and file names, so restrict the character set.
    allowed_pattern = re.compile(r"^[a-zA-Z0-9 _.-]+$")
    if not allowed_pattern.match(agent_name):
        return format_warning(
            f"{agent_name=} can only contain alphanumeric characters, spaces, dots (.), dashes (-), and underscores (_)")
    if not allowed_pattern.match(organisation):
        return format_warning(
            f"{organisation=} can only contain alphanumeric characters, spaces, dots (.), dashes (-), and underscores (_)")

    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    # The repo URL is optional, but when given it must be a valid https URL.
    if repo_url != "" and not is_valid_https_url(repo_url):
        return format_warning("If you provide a URL, it must be a valid one. You can also leave it empty.")

    if path_to_file is None:
        return format_warning("Please attach a file.")
    submission_path = path_to_file.name
    try:
        # Read everything as strings; validation expects string columns only.
        submission_df = pd.read_json(submission_path, lines=True, dtype=str)
        validation_error = validate_submission(submission_df)
        if validation_error:
            return validation_error
    except Exception as exc:
        return format_error(f"Submission file is incorrectly formatted. Please fix it and resubmit your file. {str(exc)}")
    print(f"Processing submission_id={organisation}-{agent_name}...")
    gr.Info(f"Processing submission of {agent_name}...")
    refresh(only_leaderboard=False)
    submissions_df = DATASETS["submissions"].to_pandas()
    contact_info_df = DATASETS["contact_info"].to_pandas()
    internal_tasks_df = DATASETS["internal_tasks"].to_pandas()

    # Reject resubmissions of the same organisation/agent pair.
    submission_id = f"{organisation}-{agent_name}"
    if submission_id in submissions_df["submission_id"].values:
        return format_warning(f"This {submission_id} pair has already been submitted.")
    submission_df["submission_id"] = submission_id
    submission_df["agent_name"] = agent_name
    submission_df["model_family"] = model_family
    submission_df["organisation"] = organisation
    submission_df["repo_url"] = repo_url
    submission_df["date"] = datetime.date.today().strftime("%d-%m-%Y")

    # reasoning_trace is optional; default it so the pushed file has a stable schema.
    if "reasoning_trace" not in submission_df.columns:
        submission_df["reasoning_trace"] = ""

    # Rewrite the uploaded file so it includes the metadata columns.
    submission_df.to_json(submission_path, orient="records", lines=True)

    try:
        task_scores = evaluate(
            agent_answers=submission_df,
            tasks_with_gt=internal_tasks_df,
            submission_id=submission_id,
        )
    except KeyError as exc:
        return format_error(str(exc))
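
    # Push the enriched submission file and the per-task scores to the public dataset repo.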
    filename_id = f"v1__{organisation}-{agent_name}__{datetime.datetime.today().strftime('%d-%m-%Y')}"
    path_in_repo = f"data/submissions/{filename_id}.jsonl"
    HF_API.upload_file(
        repo_id=HF_DATASET_PATH,
        path_or_fileobj=submission_path,
        path_in_repo=path_in_repo,
        repo_type="dataset",
    )
    print(f"[submission_id={organisation}-{agent_name}] Pushed submission to {HF_DATASET_PATH}/{path_in_repo}!")

    # Write the task scores to a local JSONL file mirroring the repo layout
    # (assumes the local data/task_scores/ directory exists).
    with open(f"data/task_scores/{filename_id}.jsonl", "w") as f:
        for score in task_scores:
            f.write(json.dumps(score) + "\n")

    path_in_repo = f"data/task_scores/{filename_id}.jsonl"
    HF_API.upload_file(
        repo_id=HF_DATASET_PATH,
        path_or_fileobj=f"data/task_scores/{filename_id}.jsonl",
        path_in_repo=path_in_repo,
        repo_type="dataset",
    )
    print(f"[submission_id={organisation}-{agent_name}] Pushed task_scores to {HF_DATASET_PATH}/{path_in_repo}!")
    if mail not in contact_info_df["mail"].values:
        contact_info = {
            "submission_id": submission_id,
            "agent_name": agent_name,
            "model_family": model_family,
            "repo_url": repo_url,
            "organisation": organisation,
            "mail": mail,
            "date": datetime.date.today().strftime("%d-%m-%Y"),
        }
        contact_info_df = pd.concat([contact_info_df, pd.DataFrame([contact_info])], ignore_index=True)
        contact_info_df.to_json("contact_info.jsonl", orient="records", lines=True)

        HF_API.upload_file(
            repo_id=HF_INTERNAL_DATASET_PATH,
            path_or_fileobj="contact_info.jsonl",
            path_in_repo="contact_info.jsonl",
            repo_type="dataset",
        )
        print(f"[submission_id={organisation}-{agent_name}] Pushed contact_info to {HF_INTERNAL_DATASET_PATH}/contact_info.jsonl!")

    return format_log(
        f"""
        Agent {agent_name} submitted by {organisation} successfully.
        Please refresh the leaderboard to see your score displayed.
        """)


def generate_leaderboard_df() -> pd.DataFrame:
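    """Build the leaderboard table from the cached submissions and task scores."""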
    task_scores_df = DATASETS["task_scores"].to_pandas()
    submissions_df = DATASETS["submissions"].to_pandas()

    # Keep one metadata row per submission (the metadata is repeated on every task row).
    submissions_df = (
        submissions_df.groupby("submission_id")
        .first()
        .reset_index()[
            [
                "submission_id",
                "agent_name",
                "model_family",
                "organisation",
                "repo_url",
                "date",
            ]
        ]
    )
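
    # Mean score per submission and difficulty level.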
    leaderboard_df = (
        task_scores_df.groupby(["submission_id", "level"])
        .agg(
            avg_score=("score", "mean"),
        )
        .reset_index()
    )
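
    # Pivot the levels into columns, e.g. avg_score_lvl_easy and avg_score_lvl_hard.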
    leaderboard_df = leaderboard_df.pivot(index="submission_id", columns="level", values=["avg_score"])
    leaderboard_df.columns = [f"{metric}_lvl_{level}" for metric, level in leaderboard_df.columns]
    leaderboard_df = leaderboard_df.reset_index()

    # Join the per-level scores with the submission metadata.
    leaderboard_df = pd.merge(submissions_df, leaderboard_df, on="submission_id", how="inner")

    # Rename to display names; col_map also defines the display column order.
    col_map = {
        "agent_name": "Agent",
        "avg_score_lvl_easy": "Easy Level Accuracy (%)",
        "avg_score_lvl_hard": "Hard Level Accuracy (%)",
        "organisation": "Organization",
        "repo_url": "Repo URL",
        "model_family": "Model Family",
        "date": "Date",
    }
    col_order = list(col_map.values())
    leaderboard_df.rename(columns=col_map, inplace=True)
    df = leaderboard_df[col_order].copy()

    # Scores are stored as fractions; display them as percentages.
    df["Easy Level Accuracy (%)"] = df["Easy Level Accuracy (%)"].apply(lambda x: round(x * 100, 2))
    df["Hard Level Accuracy (%)"] = df["Hard Level Accuracy (%)"].apply(lambda x: round(x * 100, 2))

    # Render repo URLs as markdown links and agent names in bold.
    df["Repo URL"] = df["Repo URL"].apply(lambda x: f"[Link]({x})" if x != "" else x)
    df["Agent"] = df["Agent"].apply(lambda x: f"**{x}**")

    # Rank by performance on the hard level.
    df.sort_values(by="Hard Level Accuracy (%)", ascending=False, inplace=True)

    return df