import os
from typing import List

import pandas as pd

from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
from src.display.utils import COLS_QA, COLS_LONG_DOC
from src.display.column_names import COL_NAME_AVG, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_IS_ANONYMOUS
from src.models import FullEvalResult

pd.options.mode.copy_on_write = True


def calculate_mean(row):
    # Rows with any missing benchmark score get a -1 sentinel instead of a partial mean.
    if pd.isna(row).any():
        return -1
    else:
        return row.mean()
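
# A quick illustration of the sentinel above (made-up values, not from the leaderboard):
#   calculate_mean(pd.Series([0.5, 0.7]))          -> 0.6
#   calculate_mean(pd.Series([0.5, float("nan")])) -> -1   (any missing benchmark voids the average)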


def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Load the evaluation results from the results*.json files under `results_path`
    """
    model_result_filepaths = []
    for root, dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue
        # collect every results*.json file; results are de-duplicated by timestamp below
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f"skip {file}")
                continue
            model_result_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # parse the raw JSON into a FullEvalResult; skip files that cannot be decoded
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError as e:
            print(f"loading file failed: {model_result_filepath} ({e})")
            continue
        print(f"file loaded: {model_result_filepath}")
        timestamp = eval_result.timestamp
        eval_results[timestamp] = eval_result

    results = []
    for k, v in eval_results.items():
        # keep only results that can be serialized without missing keys
        try:
            v.to_dict()
            results.append(v)
        except KeyError:
            print(f"loading failed: {k}")
            continue
    return results
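
# The walker above expects a layout roughly like (hypothetical example, not enforced here):
#   <results_path>/<model_or_run_dir>/results_<anything>.json
# Only filenames matching results*.json are collected; everything else is skipped with a log line.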


def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
    """
    Creates a dataframe from all the individual experiment results
    """
    cols = [COL_NAME_IS_ANONYMOUS]
    if task == "qa":
        cols += COLS_QA
        benchmark_cols = [t.value.col_name for t in BenchmarksQA]
    elif task == "long-doc":
        cols += COLS_LONG_DOC
        benchmark_cols = [t.value.col_name for t in BenchmarksLongDoc]
    else:
        raise NotImplementedError(f"unsupported task: {task}")

    all_data_json = []
    for v in raw_data:
        all_data_json += v.to_dict(task=task, metric=metric)
    df = pd.DataFrame.from_records(all_data_json)
    # print(f'dataframe created: {df.shape}')

    # calculate the average score over the benchmark columns that are actually present
    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
    df[COL_NAME_AVG] = df[list(_benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
    df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
    df.reset_index(inplace=True, drop=True)

    # keep only the configured columns, preserving their declared order
    _cols = [c for c in cols if c in df.columns]
    df = df[_cols].round(decimals=2)

    # rank by average score; rows flagged with the -1 sentinel (missing benchmarks) sort last
    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")

    # shorten the revision hash for display
    df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]

    # # replace "0" with "-" for average score
    # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
    return df
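

# Minimal usage sketch. The results directory, task, and metric values below are
# illustrative assumptions, not defaults defined by this module.
if __name__ == "__main__":
    raw_results = get_raw_eval_results("./results")
    qa_df = get_leaderboard_df(raw_results, task="qa", metric="ndcg_at_10")
    print(qa_df.head())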