# leaderboard/src/read_evals.py
import os.path
from typing import List

import pandas as pd

from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
from src.display.utils import COLS_QA, COLS_LONG_DOC
from src.display.column_names import COL_NAME_AVG, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_IS_ANONYMOUS
from src.models import FullEvalResult

pd.options.mode.copy_on_write = True


def calculate_mean(row):
    # Average a row of benchmark scores; if any score is missing, return the
    # sentinel -1 so incomplete submissions sort to the bottom.
    if pd.isna(row).any():
        return -1
    else:
        return row.mean()
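
# Illustrative behaviour (example values, not part of the original module):
#   calculate_mean(pd.Series([0.5, 0.7]))           -> 0.6
#   calculate_mean(pd.Series([0.5, float("nan")]))  -> -1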


def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Load the evaluation results from the `results*.json` files found under `results_path`
    """
    model_result_filepaths = []
    for root, dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue
        # collect the result files; only `results*.json` files are evaluation outputs
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f"skip {file}")
                continue
            model_result_filepaths.append(os.path.join(root, file))

    # parse each file; results are keyed by timestamp, so a later file with the
    # same timestamp overwrites an earlier one
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError:
            print(f"loading file failed: {model_result_filepath}")
            continue
        print(f"file loaded: {model_result_filepath}")
        timestamp = eval_result.timestamp
        eval_results[timestamp] = eval_result

    # keep only the results that can be converted to the leaderboard format
    results = []
    for k, v in eval_results.items():
        try:
            v.to_dict()
            results.append(v)
        except KeyError:
            print(f"loading failed: {k}")
            continue
    return results
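
# Illustrative call (the directory name is an assumption, not fixed by this module):
#   raw_results = get_raw_eval_results("./eval_results")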


def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
    """
    Creates a dataframe from all the individual experiment results
    """
    cols = [COL_NAME_IS_ANONYMOUS]
    if task == "qa":
        cols += COLS_QA
        benchmark_cols = [t.value.col_name for t in BenchmarksQA]
    elif task == "long-doc":
        cols += COLS_LONG_DOC
        benchmark_cols = [t.value.col_name for t in BenchmarksLongDoc]
    else:
        raise NotImplementedError(f"unsupported task: {task}")

    all_data_json = []
    for v in raw_data:
        all_data_json += v.to_dict(task=task, metric=metric)
    df = pd.DataFrame.from_records(all_data_json)
    # print(f'dataframe created: {df.shape}')

    # calculate the average score over the selected benchmarks; rows with any
    # missing benchmark get the sentinel -1 from calculate_mean
    _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
    df[COL_NAME_AVG] = df[list(_benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
    df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
    df.reset_index(inplace=True, drop=True)

    # keep only the display columns that are present in the data
    _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
    df = df[list(_cols)].round(decimals=2)

    # rank models by average score (ties share the smallest rank)
    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")

    # shorten the revision hash for display
    df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]

    # # replace "0" with "-" for average score
    # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
    return df
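
# Illustrative wiring with the example above (the metric name "ndcg_at_10" is an
# assumption, not defined in this module):
#   qa_df = get_leaderboard_df(raw_results, task="qa", metric="ndcg_at_10")
#   long_doc_df = get_leaderboard_df(raw_results, task="long-doc", metric="ndcg_at_10")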