feat: implement the version selector for qa
Files changed:
- app.py (+52, -31)
- src/benchmarks.py (+38, -21)
- src/display/columns.py (+24, -20)
- src/display/gradio_formatting.py (+9, -2)
- src/envs.py (+10, -1)
- src/loaders.py (+31, -22)
- src/models.py (+3, -1)
- src/utils.py (+107, -51)
- tests/src/test_benchmarks.py (+8, -3)
- tests/test_utils.py (+4, -3)
app.py CHANGED
@@ -1,4 +1,5 @@
 import gradio as gr
+import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler

 from src.about import (
@@ -13,7 +14,7 @@ from src.display.css_html_js import custom_css
 from src.envs import (
     API,
     EVAL_RESULTS_PATH,
-    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST
+    REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION
 )
 from src.loaders import (
     load_eval_results
@@ -48,9 +49,10 @@ def restart_space():
     # print(f'failed to download')
     # restart_space()

+global data
 data = load_eval_results(EVAL_RESULTS_PATH)
+global datastore
+datastore = data[LATEST_BENCHMARK_VERSION]

 def update_metric_qa(
     metric: str,
@@ -60,28 +62,36 @@ def update_metric_qa(
     query: str,
     show_anonymous: bool,
     show_revision_and_timestamp: bool,
-    selected_version: str,
-):
-    return update_metric(data[selected_version].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
-
-
-def update_metric_long_doc(
-    metric: str,
-    domains: list,
-    langs: list,
-    reranking_model: list,
-    query: str,
-    show_anonymous: bool,
-    show_revision_and_timestamp,
 ):
-    return update_metric(
+    return update_metric(datastore, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+
+
+# def update_metric_long_doc(
+#     metric: str,
+#     domains: list,
+#     langs: list,
+#     reranking_model: list,
+#     query: str,
+#     show_anonymous: bool,
+#     show_revision_and_timestamp,
+# ):
+#     return update_metric(datastore.raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
+
+
+def update_datastore(version):
+    global datastore
+    global data
+    datastore = data[version]
+    selected_domains = get_domain_dropdown(QABenchmarks[datastore.slug])
+    selected_langs = get_language_dropdown(QABenchmarks[datastore.slug])
+    selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
+    leaderboard_table = get_leaderboard_table(
+        datastore.raw_df_qa, datastore.types_qa)
+    hidden_leaderboard_table_for_search = get_leaderboard_table(
+        datastore.raw_df_qa, datastore.types_qa, visible=False)
+    return selected_domains, selected_langs, selected_rerankings, leaderboard_table, hidden_leaderboard_table_for_search
+# DOMAIN_COLS_LONG_DOC = list(frozenset([c.value.domain for c in list(LongDocBenchmarks)]))
+# LANG_COLS_LONG_DOC = list(frozenset([c.value.lang for c in list(LongDocBenchmarks)]))

 demo = gr.Blocks(css=custom_css)

@@ -99,11 +109,12 @@ with demo:
             with gr.Column(min_width=320):
                 # select domain
                 with gr.Row():
-                    selected_domains = get_domain_dropdown(
+                    selected_domains = get_domain_dropdown(QABenchmarks[datastore.slug])
+                    # selected_domains = get_domain_dropdown(QABenchmarks["2404"])
                 # select language
                 with gr.Row():
-                    selected_langs = get_language_dropdown(
+                    selected_langs = get_language_dropdown(QABenchmarks[datastore.slug])
+                    # selected_langs = get_language_dropdown(QABenchmarks["2404"])
             with gr.Column():
                 # select the metric
                 selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
@@ -119,16 +130,25 @@ with demo:
                 search_bar = get_search_bar()
             # select reranking models
             with gr.Column():
-                selected_rerankings = get_reranking_dropdown(
+                selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
+        # shown_table
+        leaderboard_table = get_leaderboard_table(
+            datastore.leaderboard_df_qa, datastore.types_qa)
         # Dummy leaderboard for handling the case when the user uses backspace key
-        hidden_leaderboard_table_for_search = get_leaderboard_table(
+        hidden_leaderboard_table_for_search = get_leaderboard_table(
+            datastore.raw_df_qa, datastore.types_qa, visible=False)

+        selected_version.change(
+            update_datastore,
+            [selected_version,],
+            [selected_domains, selected_langs, selected_rerankings, leaderboard_table, hidden_leaderboard_table_for_search]
+        )
         set_listeners(
             "qa",
             leaderboard_table,
             hidden_leaderboard_table_for_search,
             search_bar,
+            selected_version,
             selected_domains,
             selected_langs,
             selected_rerankings,
@@ -147,7 +167,6 @@ with demo:
                 search_bar,
                 show_anonymous,
                 show_revision_and_timestamp,
-                selected_version,
             ],
             leaderboard_table,
             queue=True
@@ -468,3 +487,5 @@ if __name__ == "__main__":
     scheduler.start()
     demo.queue(default_concurrency_limit=40)
     demo.launch()
+
+
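A note on the wiring above: selected_version.change(update_datastore, ...) uses the standard Gradio pattern where the event handler returns one new component per output and Gradio swaps each one in place. Below is a minimal, self-contained sketch of that pattern (Gradio 4 style); the STORES dict and the rebuild function are made-up stand-ins for data[version] and update_datastore, not code from this repo.

import gradio as gr

# Hypothetical per-version data, standing in for data[version] / LeaderboardDataStore.
STORES = {
    "AIR-Bench_24.04": ["bge-m3", "e5-mistral"],
    "AIR-Bench_24.05": ["bge-m3", "e5-mistral", "jina-v2"],
}

def rebuild(version):
    # One return value per output component; Gradio replaces each component's props.
    models = STORES[version]
    return gr.Dropdown(choices=models, value=models[0]), f"{len(models)} models in {version}"

with gr.Blocks() as demo:
    version = gr.Dropdown(choices=list(STORES), value="AIR-Bench_24.05", label="Benchmark version")
    model = gr.Dropdown(choices=STORES["AIR-Bench_24.05"], label="Reranking model")
    summary = gr.Markdown()
    # Same shape as selected_version.change(update_datastore, [selected_version,], [...]) above.
    version.change(rebuild, [version], [model, summary])

if __name__ == "__main__":
    demo.launch()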
src/benchmarks.py CHANGED
@@ -26,33 +26,50 @@ class Benchmark:


 # create a function return an enum class containing all the benchmarks
-def get_benchmarks_enum(benchmark_version):
-            benchmark_name = get_safe_name(
+def get_benchmarks_enum(benchmark_version, task_type):
+    benchmark_dict = {}
+    if task_type == "qa":
+        for task, domain_dict in BenchmarkTable[benchmark_version].items():
+            if task != task_type:
+                continue
+            for domain, lang_dict in domain_dict.items():
+                for lang, dataset_list in lang_dict.items():
+                    benchmark_name = get_safe_name(f"{domain}_{lang}")
                     col_name = benchmark_name
                     for metric in dataset_list:
+                        if "test" not in dataset_list[metric]["splits"]:
+                            continue
+                        benchmark_dict[benchmark_name] = \
+                            Benchmark(benchmark_name, metric, col_name, domain, lang, task)
+    elif task_type == "long-doc":
+        for task, domain_dict in BenchmarkTable[benchmark_version].items():
+            if task != task_type:
+                continue
+            for domain, lang_dict in domain_dict.items():
+                for lang, dataset_list in lang_dict.items():
                     for dataset in dataset_list:
                         benchmark_name = f"{domain}_{lang}_{dataset}"
                         benchmark_name = get_safe_name(benchmark_name)
                         col_name = benchmark_name
                         for metric in METRIC_LIST:
-                            Benchmark(
-                                lang, task)
-    return qa_benchmark_dict, long_doc_benchmark_dict
+                            benchmark_dict[benchmark_name] = \
+                                Benchmark(benchmark_name, metric, col_name, domain, lang, task)
+    return benchmark_dict

-_qa_benchmark_dict, _long_doc_benchmark_dict = get_benchmarks_enum('AIR-Bench_24.04')

+versions = ("AIR-Bench_24.04", "AIR-Bench_24.05")
+qa_benchmark_dict = {}
+for version in versions:
+    safe_version_name = get_safe_name(version)[-4:]
+    qa_benchmark_dict[safe_version_name] = Enum(f"QABenchmarks_{safe_version_name}", get_benchmarks_enum(version, "qa"))
+
+long_doc_benchmark_dict = {}
+for version in versions:
+    safe_version_name = get_safe_name(version)[-4:]
+    long_doc_benchmark_dict[safe_version_name] = Enum(f"LongDocBenchmarks_{safe_version_name}", get_benchmarks_enum(version, "long-doc"))
+
+# _qa_benchmark_dict, = get_benchmarks_enum('AIR-Bench_24.04', "qa")
+# _long_doc_benchmark_dict = get_benchmarks_enum('AIR-Bench_24.04', "long-doc")
+
+QABenchmarks = Enum('QABenchmarks', qa_benchmark_dict)
+LongDocBenchmarks = Enum('LongDocBenchmarks', long_doc_benchmark_dict)
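The construction above is an Enum of Enums: the outer QABenchmarks and LongDocBenchmarks map a four-character version slug to an inner per-version enum whose members are Benchmark entries. A minimal sketch of how the lookup and iteration behave, with placeholder string values standing in for Benchmark instances:

from enum import Enum

# Inner enums, one per version; values are placeholder strings here,
# whereas src/benchmarks.py stores Benchmark dataclass instances.
qa_2404 = Enum("QABenchmarks_2404", {"wiki_en": "wiki_en", "web_en": "web_en"})
qa_2405 = Enum("QABenchmarks_2405", {"wiki_en": "wiki_en", "news_zh": "news_zh"})

# Outer enum keyed by version slug, mirroring QABenchmarks = Enum('QABenchmarks', qa_benchmark_dict).
QABenchmarks = Enum("QABenchmarks", {"2404": qa_2404, "2405": qa_2405})

# Lookup by slug, then iterate that version's members, as the app and loaders now do.
for member in QABenchmarks["2405"].value:
    print(member.name, member.value)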
src/display/columns.py CHANGED
@@ -1,16 +1,8 @@
 from dataclasses import dataclass, make_dataclass

 from src.benchmarks import QABenchmarks, LongDocBenchmarks
-
-COL_NAME_RETRIEVAL_MODEL = "Retrieval Method"
-COL_NAME_RERANKING_MODEL = "Reranking Model"
-COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
-COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
-COL_NAME_RANK = "Rank 🏆"
-COL_NAME_REVISION = "Revision"
-COL_NAME_TIMESTAMP = "Submission Date"
-COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
+from src.envs import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
+    COL_NAME_RERANKING_MODEL_LINK, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS


 def fields(raw_class):
@@ -69,7 +61,7 @@ def get_default_auto_eval_column_dict():
 def make_autoevalcolumn(cls_name, benchmarks):
     auto_eval_column_dict = get_default_auto_eval_column_dict()
     # Leaderboard columns
-    for benchmark in benchmarks:
+    for benchmark in list(benchmarks.value):
         auto_eval_column_dict.append(
             [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
         )
@@ -78,16 +70,28 @@ def make_autoevalcolumn(cls_name, benchmarks):
     return make_dataclass(cls_name, auto_eval_column_dict, frozen=True)


-fixed_cols = get_default_auto_eval_column_dict()[:-3]
+def get_default_col_names_and_types(benchmarks):
+    AutoEvalColumn = make_autoevalcolumn("AutoEvalColumn", benchmarks)
+    col_names = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+    col_types = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+    return col_names, col_types
+
+# AutoEvalColumnQA = make_autoevalcolumn("AutoEvalColumnQA", QABenchmarks)
+# COLS_QA = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
+# TYPES_QA = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
+
+
+def get_fixed_col_names_and_types():
+    fixed_cols = get_default_auto_eval_column_dict()[:-3]
+    return [c.name for _, _, c in fixed_cols], [c.type for _, _, c in fixed_cols]
+
+# fixed_cols = get_default_auto_eval_column_dict()[:-3]
+# FIXED_COLS = [c.name for _, _, c in fixed_cols]
+# FIXED_COLS_TYPES = [c.type for _, _, c in fixed_cols]


+# AutoEvalColumnLongDoc = make_autoevalcolumn("AutoEvalColumnLongDoc", LongDocBenchmarks)
+# COLS_LONG_DOC = [c.name for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
+# TYPES_LONG_DOC = [c.type for c in fields(AutoEvalColumnLongDoc) if not c.hidden]

 # Column selection
-COLS_QA = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
-COLS_LONG_DOC = [c.name for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
-TYPES_QA = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
-TYPES_LONG_DOC = [c.type for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
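Because the column lists are no longer module-level constants, they are now rebuilt on demand from a dynamically created dataclass. A small sketch of that make_dataclass round trip is shown below; it uses dataclasses.fields directly and made-up column entries, so the repo's own fields() helper and ColumnContent definition may differ in detail.

from dataclasses import dataclass, make_dataclass, fields as dc_fields

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

# Each entry is (attribute_name, annotation, default), as in get_default_auto_eval_column_dict().
cols = [
    ("rank", ColumnContent, ColumnContent("Rank 🏆", "number", True)),
    ("wiki_en", ColumnContent, ColumnContent("wiki_en", "number", True)),
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", cols, frozen=True)

# Same idea as get_default_col_names_and_types(): read display names and types back off the class.
col_names = [f.default.name for f in dc_fields(AutoEvalColumn) if not f.default.hidden]
col_types = [f.default.type for f in dc_fields(AutoEvalColumn) if not f.default.hidden]
print(col_names)  # ['Rank 🏆', 'wiki_en']
print(col_types)  # ['number', 'number']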
src/display/gradio_formatting.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from src.envs import BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION
+from src.benchmarks import QABenchmarks

 def get_version_dropdown():
     return gr.Dropdown(
@@ -52,7 +53,10 @@ def get_metric_dropdown(metric_list, default_metrics):
     )


-def get_domain_dropdown(
+def get_domain_dropdown(benchmarks, default_domains=None):
+    domain_list = list(frozenset([c.value.domain for c in list(benchmarks.value)]))
+    if default_domains is None:
+        default_domains = domain_list
     return gr.CheckboxGroup(
         choices=domain_list,
         value=default_domains,
@@ -61,7 +65,10 @@ def get_domain_dropdown(domain_list, default_domains):
     )


-def get_language_dropdown(
+def get_language_dropdown(benchmarks, default_languages=None):
+    language_list = list(frozenset([c.value.lang for c in list(benchmarks.value)]))
+    if default_languages is None:
+        default_languages = language_list
     return gr.Dropdown(
         choices=language_list,
         value=default_languages,
src/envs.py CHANGED
@@ -27,7 +27,7 @@ BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")

 BENCHMARK_VERSION_LIST = [
     "AIR-Bench_24.04",
+    "AIR-Bench_24.05",
 ]

 LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
@@ -65,3 +65,12 @@ METRIC_LIST = [
     "mrr_at_100",
     "mrr_at_1000"
 ]
+COL_NAME_AVG = "Average ⬆️"
+COL_NAME_RETRIEVAL_MODEL = "Retrieval Method"
+COL_NAME_RERANKING_MODEL = "Reranking Model"
+COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
+COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
+COL_NAME_RANK = "Rank 🏆"
+COL_NAME_REVISION = "Revision"
+COL_NAME_TIMESTAMP = "Submission Date"
+COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
src/loaders.py CHANGED
@@ -3,8 +3,8 @@ from typing import List

 import pandas as pd

-from src.envs import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
+from src.envs import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, COL_NAME_REVISION, COL_NAME_TIMESTAMP, \
+    COL_NAME_IS_ANONYMOUS, BENCHMARK_VERSION_LIST

 from src.models import FullEvalResult, LeaderboardDataStore
 from src.utils import get_default_cols, get_leaderboard_df
@@ -50,34 +50,43 @@ def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
             continue
     return results

-def load_leaderboard_datastore(file_path) -> LeaderboardDataStore:
+def get_safe_name(name: str):
+    """Get RFC 1123 compatible safe name"""
+    name = name.replace('-', '_')
+    return ''.join(
+        character.lower()
+        for character in name
+        if (character.isalnum() or character == '_'))
+
+def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
+    slug = get_safe_name(version)[-4:]
+    lb_data_store = LeaderboardDataStore(version, slug, None, None, None, None, None, None, None, None)
     lb_data_store.raw_data = load_raw_eval_results(file_path)
     print(f'raw data: {len(lb_data_store.raw_data)}')

     lb_data_store.raw_df_qa = get_leaderboard_df(
-        lb_data_store
+        lb_data_store, task='qa', metric=DEFAULT_METRIC_QA)
     print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
     lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
-    shown_columns_qa, types_qa = get_default_cols(
+    shown_columns_qa, types_qa = get_default_cols('qa', lb_data_store.slug, add_fix_cols=True)
+    # shown_columns_qa, types_qa = get_default_cols(
+    #     'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
     lb_data_store.types_qa = types_qa
     lb_data_store.leaderboard_df_qa = \
         lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
     lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

-    lb_data_store.raw_df_long_doc = get_leaderboard_df(
-    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
-    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
-    shown_columns_long_doc, types_long_doc = get_default_cols(
-    lb_data_store.types_long_doc = types_long_doc
-    lb_data_store.leaderboard_df_long_doc = \
-    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
+    # lb_data_store.raw_df_long_doc = get_leaderboard_df(
+    #     lb_data_store, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
+    # print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
+    # lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
+    # shown_columns_long_doc, types_long_doc = get_default_cols(
+    #     'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
+    # lb_data_store.types_long_doc = types_long_doc
+    # lb_data_store.leaderboard_df_long_doc = \
+    #     lb_data_store.leaderboard_df_long_doc[
+    #         ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
+    # lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

     lb_data_store.reranking_models = sorted(
         list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
@@ -86,8 +95,8 @@ def load_leaderboard_datastore(file_path) -> LeaderboardDataStore:

 def load_eval_results(file_path: str):
     output = {}
-    versions =
-    for version in
+    # versions = BENCHMARK_VERSION_LIST
+    for version in BENCHMARK_VERSION_LIST:
         fn = f"{file_path}/{version}"
-        output[version] = load_leaderboard_datastore(fn)
+        output[version] = load_leaderboard_datastore(fn, version)
     return output
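A quick check of what the new slug logic produces. The get_safe_name helper is copied verbatim from the diff above; only the print loop is added for illustration.

def get_safe_name(name: str):
    """Get RFC 1123 compatible safe name (same helper as in the diff above)."""
    name = name.replace('-', '_')
    return ''.join(
        character.lower()
        for character in name
        if (character.isalnum() or character == '_'))

# The enum key is the last four characters of the safe name,
# so "AIR-Bench_24.05" becomes "air_bench_2405" and then "2405".
for version in ["AIR-Bench_24.04", "AIR-Bench_24.05"]:
    print(version, "->", get_safe_name(version), "->", get_safe_name(version)[-4:])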
src/models.py CHANGED
@@ -6,7 +6,7 @@ from typing import List, Optional
 import pandas as pd

 from src.benchmarks import get_safe_name
-from src.
+from src.envs import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
     COL_NAME_RERANKING_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 from src.display.formatting import make_clickable_model

@@ -128,6 +128,8 @@ class FullEvalResult:

 @dataclass
 class LeaderboardDataStore:
+    version: str
+    slug: str
     raw_data: Optional[list]
     raw_df_qa: Optional[pd.DataFrame]
     raw_df_long_doc: Optional[pd.DataFrame]
src/utils.py CHANGED
@@ -2,17 +2,14 @@ import json
 import hashlib
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import List

 import pandas as pd

 from src.benchmarks import QABenchmarks, LongDocBenchmarks
 from src.display.formatting import styled_message, styled_error
-from src.display.columns import
-from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
-from src.models import FullEvalResult
+from src.display.columns import get_default_col_names_and_types, get_fixed_col_names_and_types
+from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION, COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, \
+    COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS

 import re

@@ -62,61 +59,95 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[(df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False))]


-def get_default_cols(task: str,
+def get_default_cols(task: str, version_slug, add_fix_cols: bool=True) -> tuple:
     cols = []
     types = []
     if task == "qa":
-        types_list = TYPES_LONG_DOC
-        benchmark_list = [c.value.col_name for c in list(LongDocBenchmarks)]
+        benchmarks = QABenchmarks[version_slug]
+        cols_list, types_list = get_default_col_names_and_types(benchmarks)
+        # cols_list = COLS_QA
+        # types_list = TYPES_QA
+        benchmark_list = [c.value.col_name for c in list(benchmarks.value)]
     else:
         raise NotImplemented
     for col_name, col_type in zip(cols_list, types_list):
         if col_name not in benchmark_list:
             continue
-        if len(columns) > 0 and col_name not in columns:
-            continue
         cols.append(col_name)
         types.append(col_type)

     if add_fix_cols:
         _cols = []
         _types = []
+        fixed_cols, fixed_cols_types = get_fixed_col_names_and_types()
         for col_name, col_type in zip(cols, types):
-            if col_name in
+            if col_name in fixed_cols:
                 continue
             _cols.append(col_name)
             _types.append(col_type)
+        cols = fixed_cols + _cols
+        types = fixed_cols_types + _types
     return cols, types


+# def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
+#     cols = []
+#     types = []
+#     if task == "qa":
+#         cols_list = COLS_QA
+#         types_list = TYPES_QA
+#         benchmark_list = [c.value.col_name for c in list(QABenchmarks)]
+#     elif task == "long-doc":
+#         cols_list = COLS_LONG_DOC
+#         types_list = TYPES_LONG_DOC
+#         benchmark_list = [c.value.col_name for c in list(LongDocBenchmarks)]
+#     else:
+#         raise NotImplemented
+#     for col_name, col_type in zip(cols_list, types_list):
+#         if col_name not in benchmark_list:
+#             continue
+#         if len(columns) > 0 and col_name not in columns:
+#             continue
+#         cols.append(col_name)
+#         types.append(col_type)
+#
+#     if add_fix_cols:
+#         _cols = []
+#         _types = []
+#         for col_name, col_type in zip(cols, types):
+#             if col_name in FIXED_COLS:
+#                 continue
+#             _cols.append(col_name)
+#             _types.append(col_type)
+#         cols = FIXED_COLS + _cols
+#         types = FIXED_COLS_TYPES + _types
+#     return cols, types
+
+
 def select_columns(
     df: pd.DataFrame,
     domain_query: list,
     language_query: list,
     task: str = "qa",
-    reset_ranking: bool = True
+    reset_ranking: bool = True,
+    version_slug: str = None
 ) -> pd.DataFrame:
-    cols, _ = get_default_cols(task=task,
+    cols, _ = get_default_cols(task=task, version_slug=version_slug, add_fix_cols=False)
     selected_cols = []
     for c in cols:
         if task == "qa":
-            eval_col = QABenchmarks[c].value
+            eval_col = QABenchmarks[version_slug].value[c].value
         elif task == "long-doc":
-            eval_col = LongDocBenchmarks[c].value
+            eval_col = LongDocBenchmarks[version_slug].value[c].value
         if eval_col.domain not in domain_query:
             continue
         if eval_col.lang not in language_query:
             continue
         selected_cols.append(c)
     # We use COLS to maintain sorting
+    fixed_cols, _ = get_fixed_col_names_and_types()
+    filtered_df = df[fixed_cols + selected_cols]
+    filtered_df.replace({"": pd.NA}, inplace=True)
     if reset_ranking:
         filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
         filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
@@ -125,9 +156,17 @@ def select_columns(

     return filtered_df

+def get_safe_name(name: str):
+    """Get RFC 1123 compatible safe name"""
+    name = name.replace('-', '_')
+    return ''.join(
+        character.lower()
+        for character in name
+        if (character.isalnum() or character == '_'))

 def _update_table(
     task: str,
+    version: str,
     hidden_df: pd.DataFrame,
     domains: list,
     langs: list,
@@ -137,33 +176,24 @@ def _update_table(
     reset_ranking: bool = True,
     show_revision_and_timestamp: bool = False
 ):
+    version_slug = get_safe_name(version)[-4:]
+    if isinstance(hidden_df, str):
+        print(f"task: {task}")
+        print(f"version: {version}")
+        print(f"hidden_df is a string: {hidden_df}")
     filtered_df = hidden_df.copy()
     if not show_anonymous:
         filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
     filtered_df = filter_models(filtered_df, reranking_query)
     filtered_df = filter_queries(query, filtered_df)
-    filtered_df = select_columns(filtered_df, domains, langs, task, reset_ranking)
+    filtered_df = select_columns(filtered_df, domains, langs, task, reset_ranking, version_slug)
     if not show_revision_and_timestamp:
         filtered_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
     return filtered_df


-def update_table(
-    hidden_df: pd.DataFrame,
-    domains: list,
-    langs: list,
-    reranking_query: list,
-    query: str,
-    show_anonymous: bool,
-    show_revision_and_timestamp: bool = False,
-    reset_ranking: bool = True
-):
-    return _update_table(
-        "qa",
-        hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
-
-
 def update_table_long_doc(
+    version: str,
     hidden_df: pd.DataFrame,
     domains: list,
     langs: list,
@@ -176,11 +206,12 @@ def update_table_long_doc(
 ):
     return _update_table(
         "long-doc",
+        version,
         hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)


 def update_metric(
+    datastore,
     task: str,
     metric: str,
     domains: list,
@@ -190,9 +221,12 @@ def update_metric(
     show_anonymous: bool = False,
     show_revision_and_timestamp: bool = False,
 ) -> pd.DataFrame:
+    # raw_data = datastore.raw_data
     if task == 'qa':
-        leaderboard_df = get_leaderboard_df(
+        leaderboard_df = get_leaderboard_df(datastore, task=task, metric=metric)
+        version = datastore.version
         return update_table(
+            version,
             leaderboard_df,
             domains,
             langs,
@@ -202,8 +236,10 @@ def update_metric(
             show_revision_and_timestamp
         )
     elif task == "long-doc":
-        leaderboard_df = get_leaderboard_df(
+        leaderboard_df = get_leaderboard_df(datastore, task=task, metric=metric)
+        version = datastore.version
         return update_table_long_doc(
+            version,
             leaderboard_df,
             domains,
             langs,
@@ -321,17 +357,20 @@ def reset_rank(df):
     return df


-def get_leaderboard_df(
+def get_leaderboard_df(datastore, task: str, metric: str) -> pd.DataFrame:
     """
     Creates a dataframe from all the individual experiment results
     """
+    raw_data = datastore.raw_data
     cols = [COL_NAME_IS_ANONYMOUS, ]
     if task == "qa":
+        benchmarks = QABenchmarks[datastore.slug]
+        cols_qa, _ = get_default_col_names_and_types(benchmarks)
+        cols += cols_qa
+        benchmark_cols = [t.value.col_name for t in list(benchmarks.value)]
+    # elif task == "long-doc":
+    #     cols += COLS_LONG_DOC
+    #     benchmark_cols = [t.value.col_name for t in LongDocBenchmarks]
     else:
         raise NotImplemented
     all_data_json = []
@@ -366,6 +405,7 @@ def set_listeners(
     target_df,
     source_df,
     search_bar,
+    version,
     selected_domains,
     selected_langs,
     selected_rerankings,
@@ -385,11 +425,27 @@ def set_listeners(
         search_bar,
         show_anonymous
     ]
-    search_bar_args = [source_df,] + selector_list
-    selector_args =
+    search_bar_args = [source_df, version,] + selector_list
+    selector_args = [version, source_df] + selector_list + [show_revision_and_timestamp,]
     # Set search_bar listener
     search_bar.submit(update_table_func, search_bar_args, target_df)

     # Set column-wise listener
     for selector in selector_list:
         selector.change(update_table_func, selector_args, target_df, queue=True,)
+
+def update_table(
+    version: str,
+    hidden_df: pd.DataFrame,
+    domains: list,
+    langs: list,
+    reranking_query: list,
+    query: str,
+    show_anonymous: bool,
+    show_revision_and_timestamp: bool = False,
+    reset_ranking: bool = True,
+):
+    return _update_table(
+        "qa",
+        version,
+        hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
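To make the reset-ranking path in select_columns concrete, here is a minimal pandas sketch of the same steps on a toy frame: blank scores become NA, an average over the selected benchmark columns is recomputed, and rows are re-sorted. The column names are made up, and a plain skipna mean stands in for the repo's calculate_mean.

import pandas as pd

# Toy leaderboard slice: two benchmark columns plus the retrieval method.
df = pd.DataFrame({
    "Retrieval Method": ["bm25", "dense"],
    "wiki_en": [0.41, ""],          # empty string marks a missing score
    "news_zh": [0.35, 0.52],
})

selected_cols = ["wiki_en", "news_zh"]
filtered_df = df[["Retrieval Method"] + selected_cols].copy()
filtered_df.replace({"": pd.NA}, inplace=True)                        # as in select_columns()
scores = filtered_df[selected_cols].apply(pd.to_numeric, errors="coerce")  # coerce object columns to floats
filtered_df["Average ⬆️"] = scores.mean(axis=1, skipna=True).round(2)
filtered_df.sort_values(by=["Average ⬆️"], ascending=False, inplace=True)
print(filtered_df)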
tests/src/test_benchmarks.py CHANGED
@@ -2,9 +2,14 @@ from src.benchmarks import QABenchmarks, LongDocBenchmarks


 def test_qabenchmarks():
+    for benchmark_list in list(QABenchmarks):
+        print(benchmark_list.name)
+        for b in list(benchmark_list.value):
+            print(b)
+    qa_benchmarks = QABenchmarks["2404"]
+    l = list(frozenset([c.value.domain for c in list(qa_benchmarks.value)]))
+    print(l)
+


 def test_longdocbenchmarks():
tests/test_utils.py CHANGED
@@ -1,9 +1,10 @@
 import pandas as pd
 import pytest

-from src.utils import filter_models, search_table, filter_queries, select_columns, update_table_long_doc, get_iso_format_timestamp, get_default_cols
-from
+from src.utils import filter_models, search_table, filter_queries, select_columns, update_table_long_doc, get_iso_format_timestamp, get_default_cols
+from app import update_table
+from src.envs import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_REVISION, \
+    COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS


 @pytest.fixture