import os
from typing import List

import pandas as pd

from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
from src.display.utils import COLS_QA, COLS_LONG_DOC
from src.display.column_names import COL_NAME_AVG, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_IS_ANONYMOUS
from src.models import FullEvalResult

pd.options.mode.copy_on_write = True


def calculate_mean(row):
    """Return the row mean, or -1 if any benchmark score is missing."""
    if pd.isna(row).any():
        return -1
    return row.mean()


def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Load the evaluation results from the results*.json files under `results_path`.
    """
    model_result_filepaths = []
    for root, dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue

        # keep only the results*.json files
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f"skip {file}")
                continue
            model_result_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # parse each file into a FullEvalResult, keyed by its timestamp
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError:
            print(f"loading file failed: {model_result_filepath}")
            continue
        print(f"file loaded: {model_result_filepath}")
        eval_results[eval_result.timestamp] = eval_result

    # drop results that cannot be converted to the leaderboard format
    results = []
    for timestamp, eval_result in eval_results.items():
        try:
            eval_result.to_dict()
            results.append(eval_result)
        except KeyError:
            print(f"loading failed: {timestamp}")
            continue
    return results


def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
    """
    Create a leaderboard dataframe from all the individual evaluation results.
    """
    cols = [COL_NAME_IS_ANONYMOUS]
    if task == "qa":
        cols += COLS_QA
        benchmark_cols = [t.value.col_name for t in BenchmarksQA]
    elif task == "long-doc":
        cols += COLS_LONG_DOC
        benchmark_cols = [t.value.col_name for t in BenchmarksLongDoc]
    else:
        raise NotImplementedError(f"task {task} is not supported")

    all_data_json = []
    for v in raw_data:
        all_data_json += v.to_dict(task=task, metric=metric)
    df = pd.DataFrame.from_records(all_data_json)
    # print(f'dataframe created: {df.shape}')

    # keep only the benchmark columns that are actually present, preserving their order
    _benchmark_cols = [c for c in benchmark_cols if c in df.columns]

    # calculate the average score over the selected benchmarks
    # (calculate_mean returns -1 when any benchmark result is missing)
    df[COL_NAME_AVG] = df[_benchmark_cols].apply(calculate_mean, axis=1).round(decimals=2)
    df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
    df.reset_index(inplace=True, drop=True)

    # keep only the display columns that are present, preserving their order
    _cols = [c for c in cols if c in df.columns]
    df = df[_cols].round(decimals=2)

    # rank models by their average score
    df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")

    # shorten the revision hash for display
    df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]

    # # replace "0" with "-" for average score
    # df[COL_NAME_AVG] = df[COL_NAME_AVG].replace(0, "-")
    return df
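

# --- Usage sketch (illustrative only) ---
# A minimal example of how the two helpers above are typically chained to build the
# leaderboard table. The "./eval_results" directory and the "ndcg_at_10" metric name
# are assumptions made for this sketch; substitute the values your deployment actually
# uses. The "qa" task name comes from get_leaderboard_df above.
if __name__ == "__main__":
    raw_results = get_raw_eval_results("./eval_results")  # hypothetical results directory
    leaderboard_df = get_leaderboard_df(raw_results, task="qa", metric="ndcg_at_10")  # assumed metric name
    print(leaderboard_df.head())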