import datetime
import json
import os
import re
from email.utils import parseaddr

import gradio as gr
import pandas as pd
from datasets import load_dataset
from huggingface_hub import HfApi

from dabstep_benchmark.utils import format_log, format_error, format_warning, is_valid_https_url, evaluate

OWNER = "adyen"

HF_API = HfApi()
HF_LEADERBOARD = f"{OWNER}/DABstep"
HF_DATASET_PATH = f"{OWNER}/DABstep"
HF_INTERNAL_DATASET_PATH = f"{OWNER}/DABstep-internal"
HF_DATASET_CONFIGS = [
    "tasks",
    "submissions",
    "task_scores"
]
DATASETS = {}  # module-level cache of the dataset splits loaded from the Hub

def refresh(only_leaderboard: bool = False):
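    """Download the benchmark datasets from the Hub into the DATASETS cache and rebuild the leaderboard.

    When only_leaderboard is True, only the public task_scores and submissions configs are refreshed;
    otherwise the internal tasks (with ground truth) and contact_info configs are reloaded as well.
    """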
    if only_leaderboard:
        for config_name in ["task_scores", "submissions"]:
            DATASETS[f"{config_name}"] = load_dataset(
                path=HF_DATASET_PATH,
                name=config_name,
                split="default",
            )
            print(f"Downloaded {HF_DATASET_PATH}/{config_name}")

    else:
        for config_name in HF_DATASET_CONFIGS:
            DATASETS[f"{config_name}"] = load_dataset(
                path=HF_DATASET_PATH,
                name=config_name,
                split="default",
            )
            print(f"Downloaded {HF_DATASET_PATH}/{config_name}")

        DATASETS["internal_tasks"] = load_dataset(
            path=HF_INTERNAL_DATASET_PATH,
            name="tasks",
            split="default",
        )
        print(f"Downloaded {HF_INTERNAL_DATASET_PATH}/tasks")
        DATASETS["contact_info"] = load_dataset(
            path=HF_INTERNAL_DATASET_PATH,
            name="contact_info",
            split="default",
        )
        print(f"Downloaded {HF_INTERNAL_DATASET_PATH}/contact_info")

    return generate_leaderboard_df()


def validate_submission(submission_df: pd.DataFrame):
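    """Check that the submission dataframe has the expected columns, no extras, no NaN values, and string dtypes.

    Expected JSONL rows look like (reasoning_trace is optional):
        {"task_id": "...", "agent_answer": "...", "reasoning_trace": "..."}

    Returns a formatted error string on failure, or None if the submission is valid.
    """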
    # mandatory_columns = ["agent_answer", "task_id", "num_steps"]
    mandatory_columns = ["agent_answer", "task_id"]
    expected_columns = [*mandatory_columns, "reasoning_trace"]

    # Check for missing mandatory columns
    missing_columns = [col for col in mandatory_columns if col not in submission_df.columns]
    if missing_columns:
        return format_error(f"Missing mandatory columns: {', '.join(missing_columns)}")

    # Check for unexpected columns
    unexpected_columns = [col for col in submission_df.columns if col not in expected_columns]
    if unexpected_columns:
        return format_error(f"Unexpected columns: {', '.join(unexpected_columns)}")

    # Check for NaN values in any column
    if submission_df.isnull().values.any():
        return format_error("Submission contains NaN values. Please ensure no missing data.")

    # Check if all columns are of string type
    non_string_columns = [col for col in submission_df.columns if submission_df[col].dtype != 'object']
    if non_string_columns:
        return format_error(f"Columns with non-string data type: {', '.join(non_string_columns)}")

    return None  # No errors

def process_submission(
        split: str,
        agent_name: str,
        model_family: str,
        repo_url: str,
        path_to_file: str,
        organisation: str,
        mail: str,
):
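    """Validate a leaderboard submission, score it against the internal ground truth, and push the results to the Hub.

    Returns a formatted warning/error string if any validation step fails, otherwise a success log message.
    """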
    if agent_name == "":
        return format_warning("Please provide an agent name")
    if organisation == "":
        return format_warning("Please provide an organisation")
    if mail == "":
        return format_warning("Please provide an email")
    if model_family == "":
        return format_warning("Please provide a model family")

    allowed_pattern = re.compile(r'^[a-zA-Z0-9 _.-]+$')
    if not allowed_pattern.match(agent_name):
        return format_warning(
            f"{agent_name=} can only contain alphanumeric characters, spaces, dashes (-), and underscores (_)")

    if not allowed_pattern.match(organisation):
        return format_warning(
            f"{organisation=} can only contain alphanumeric characters, spaces, dashes (-), and underscores (_)")


    # very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if not "@" in parsed_mail:
        return format_warning("Please provide a valid email address.")

    if repo_url != "" and not is_valid_https_url(repo_url):
        return format_warning("If you provide a URL it must be a valid one. You can also leave it empty")

    # submission file validation
    if path_to_file is None:
        return format_warning("Please attach a file.")
    submission_path = path_to_file.name
    try:
        submission_df = pd.read_json(submission_path, lines=True, dtype=str)
        validation_error = validate_submission(submission_df)
        if validation_error:
            return validation_error
    except Exception as exc:
        return format_error(f"Submission file is incorrectly formatted. Please fix it and resubmit your file. {str(exc)}")


    print(f"Processing submission_id={organisation}-{agent_name}...")
    gr.Info(f"Processing submission of {agent_name}...")
    refresh(only_leaderboard=False)
    submissions_df = DATASETS["submissions"].to_pandas()
    contact_info_df = DATASETS["contact_info"].to_pandas()
    internal_tasks_df = DATASETS["internal_tasks"].to_pandas()


    # check if this organisation/agent pair has already been submitted
    submission_id = f"{organisation}-{agent_name}"
    if submission_id in submissions_df["submission_id"].values:
        return format_warning(f"A submission for {submission_id} already exists.")

    # process submission
    submission_df["submission_id"] = submission_id
    submission_df["agent_name"] = agent_name
    submission_df["model_family"] = model_family
    submission_df["organisation"] = organisation
    submission_df["repo_url"] = repo_url
    submission_df["date"] = datetime.date.today().strftime("%d-%m-%Y")

    # add an empty reasoning trace if one is not provided, so the dataset schema is not broken
    if "reasoning_trace" not in submission_df.columns:
        submission_df["reasoning_trace"] = ""

    # overwrite submission
    submission_df.to_json(submission_path, orient="records", lines=True)

    try:
        task_scores = evaluate(
            agent_answers=submission_df,
            tasks_with_gt=internal_tasks_df,
            submission_id=submission_id
        )
    except KeyError as exc:
        return format_error(str(exc))


    # save submitted file once evaluation has run correctly
    filename_id = f"v1__{organisation}-{agent_name}__{datetime.datetime.today().strftime('%d-%m-%Y')}"
    path_in_repo = f"data/submissions/{filename_id}.jsonl"
    HF_API.upload_file(
        repo_id=HF_DATASET_PATH,
        path_or_fileobj=submission_path,
        path_in_repo=path_in_repo,
        repo_type="dataset",
    )
    print(f"[submission_id={organisation}-{agent_name}] Pushed submission to {HF_DATASET_PATH}/{path_in_repo} !")

    # write scores to disk (create the local directory first in case it does not exist yet)
    os.makedirs("data/task_scores", exist_ok=True)
    with open(f"data/task_scores/{filename_id}.jsonl", "w") as f:
        for score in task_scores:
            f.write(json.dumps(score) + "\n")

    # upload scores to hub dataset
    path_in_repo = f"data/task_scores/{filename_id}.jsonl"
    HF_API.upload_file(
        repo_id=HF_DATASET_PATH,
        path_or_fileobj=f"data/task_scores/{filename_id}.jsonl",
        path_in_repo=path_in_repo,
        repo_type="dataset",
    )
    print(f"[submission_id={organisation}-{agent_name}] Pushed task_scores to {HF_DATASET_PATH}/{path_in_repo} !")

    # if we already have this email, don't save its metadata again
    if mail not in contact_info_df["mail"].values:
        contact_info = {
            "submission_id": submission_id,
            "agent_name": agent_name,
            "model_family": model_family,
            "repo_url": repo_url,
            "organisation": organisation,
            "mail": mail,
            "date": datetime.date.today().strftime("%d-%m-%Y"),
        }
        contact_info_df = pd.concat([contact_info_df, pd.DataFrame([contact_info])], ignore_index=True)
        contact_info_df.to_json("contact_info.jsonl", orient="records", lines=True)

        HF_API.upload_file(
            repo_id=HF_INTERNAL_DATASET_PATH,
            path_or_fileobj="contact_info.jsonl",
            path_in_repo="contact_info.jsonl",
            repo_type="dataset",
        )
        print(f"[submission_id={organisation}-{agent_name}] Pushed contact_info to {HF_INTERNAL_DATASET_PATH}/contact_info.jsonl !")


    return format_log(
        f"Agent {agent_name} submitted by {organisation} successfully. "
        "Please refresh the leaderboard to see your score displayed."
    )

def generate_leaderboard_df() -> pd.DataFrame:
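    """Build the leaderboard dataframe: average task scores per submission and level, then join submission metadata."""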
    task_scores_df = DATASETS["task_scores"].to_pandas()
    submissions_df = DATASETS["submissions"].to_pandas()

    # get metadata of each submission_id
    submissions_df = (
        submissions_df.groupby("submission_id")
        .first()
        .reset_index()[
            [
                "submission_id",
                "agent_name",
                "model_family",
                "organisation",
                "repo_url",
                "date"
            ]
        ]
    )

    # make num_steps a number
    # task_scores_df["num_steps"] = pd.to_numeric(task_scores_df["num_steps"], errors="coerce")

    # group scores per submission
    leaderboard_df = (
        task_scores_df.groupby(["submission_id", "level"])
        .agg(
            avg_score=("score", "mean"),
            # avg_num_steps=("num_steps", "mean")
        )
        .reset_index()
    )

    # reshape
    # leaderboard_df = leaderboard_df.pivot(index="submission_id", columns="level", values=["avg_score", "avg_num_steps"])
    leaderboard_df = leaderboard_df.pivot(index="submission_id", columns="level", values=["avg_score"])
    leaderboard_df.columns = [f"{metric}_lvl_{level}" for metric, level in leaderboard_df.columns]
    leaderboard_df = leaderboard_df.reset_index()

    # leaderboard_df["overall_avg_steps"] = (
    #         leaderboard_df.get("avg_num_steps_lvl_1", 0) +
    #         leaderboard_df.get("avg_num_steps_lvl_2", 0) +
    #         leaderboard_df.get("avg_num_steps_lvl_3", 0)
    # )
    # leaderboard_df["overall_avg_steps"] = leaderboard_df["overall_avg_steps"] / 3

    # join scores and submission metadata
    leaderboard_df = pd.merge(submissions_df, leaderboard_df, on="submission_id", how="inner")

    # renaming
    col_map = {
        "agent_name": "Agent",
        "avg_score_lvl_easy": "Easy Level Accuracy (%)",
        "avg_score_lvl_hard": "Hard Level Accuracy (%)",
        # "overall_avg_steps": "Overall Avg Reasoning Steps",
        # "avg_num_steps_lvl_1": "Level 1 Avg Reasoning Steps",
        # "avg_num_steps_lvl_2": "Level 2 Avg Reasoning Steps",
        # "avg_num_steps_lvl_3": "Level 3 Avg Reasoning Steps",
        "organisation": "Organization",
        "repo_url": "Repo URL",
        "model_family": "Model Family",
        "date": "Date"
    }
    col_order = list(col_map.values())
    leaderboard_df.rename(columns=col_map, inplace=True)
    df = leaderboard_df[col_order].copy()

    # formatting
    # convert scores to %
    df["Easy Level Accuracy (%)"] = df["Easy Level Accuracy (%)"].apply(lambda x: round(x * 100, 2))
    df["Hard Level Accuracy (%)"] = df["Hard Level Accuracy (%)"].apply(lambda x: round(x * 100, 2))

    # make repo url clickable in markdown
    df["Repo URL"] = df["Repo URL"].apply(lambda x: f"[Link]({x})" if x != "" else x)

    # make agent name bold
    df["Agent"] = df["Agent"].apply(lambda x: f"**{x}**")

    # sort by best score
    df.sort_values(by="Hard Level Accuracy (%)", ascending=False, inplace=True)

    return df