import json
import os
import time
from pathlib import Path

import gradio as gr
from datasets import load_dataset, get_dataset_config_names
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from src.about import (
    ABOUT_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    TITLE,
    TITLE_MARKDOWN_DESCRIPTION,
)
from src.display.formatting import benchmark_version_hyperlink, leaderboard_version_hyperlink
from src.display.utils import ModelType, Precision, WeightType
from src.envs import API, BENCHMARK_REPO, REPO_ID
from src.populate import create_leaderboard_df, get_sorted_versions
from src.submission.check_validity import (
    validate_report_format,
    validate_results_coverage,
    validate_results_structure,
)
from src.submission.submit import update_dataset_with_scores


STATIC_DIR = str(Path(__file__).parent / "src" / "static")
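
# Module-level cache of the current leaderboard table; populated by initialize_leaderboard().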
current_leaderboard_df = None


def initialize_leaderboard():
    """Initialize the global leaderboard DataFrame"""
    global current_leaderboard_df
    # create_leaderboard_df() returns (DataFrame, available_versions); keep only the frame here.
    current_leaderboard_df, _ = create_leaderboard_df()
    return current_leaderboard_df

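
# Submission pipeline: parse the uploaded JSON, run the three validators against the benchmark
# configs, then push the scores. Every failure path returns a user-facing status string.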
def process_submission(
    model_name: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
    results_file: Path,
    params: float,
    is_reasoning: bool,
    is_moe: bool,
    progress=gr.Progress(),
) -> str:
    """Processes model submission and updates the leaderboard with visual progress tracking."""
    try:
        progress(0, desc="Starting...")

        with open(results_file.name, "r") as f:
            results = json.load(f)

        dataset_configs = get_dataset_config_names(BENCHMARK_REPO)

        steps = progress.tqdm(
            [
                ("Validating structure", validate_results_structure, (results,)),
                ("Checking coverage", validate_results_coverage, (results, dataset_configs)),
                ("Validating format", validate_report_format, (results,)),
            ],
            desc="Processing submission...",
        )

        for desc, func, args in steps:
            time.sleep(0.5)
            if not func(*args):
                return f"❌ Error during {desc.lower()}"

        progress(0.8, desc="Preparing metadata...")
        meta_info = {
            "model_id": f"{model_name}-{revision}",
            "name": model_name,
            # Compares model_type against the dropdown's display string for the open-source type.
            "is_open_source": model_type == "open_source : Open Source Model",
            "is_reasoning": is_reasoning,
        }

        progress(0.9, desc="Updating leaderboard...")
        update_dataset_with_scores(meta_info, results, dataset_configs)

        progress(1.0, desc="Done!")
        return "✅ Successfully validated results and updated leaderboard!"

    except Exception as e:
        return f"❌ Error: {str(e)}"

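
# Assumption: get_sorted_versions() yields the current version label for the loaded dataset;
# a bare "train" split is treated as an unversioned 1.0.0 release.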
def get_benchmark_version():
    """Get the current benchmark dataset version"""
    try:
        configs = get_dataset_config_names(BENCHMARK_REPO)
        _benchmark_dataset = load_dataset(BENCHMARK_REPO, configs[0])
        version = get_sorted_versions(_benchmark_dataset)
        if version == "train":
            version = "1.0.0"
        return version
    except Exception:
        return "Unknown"

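
# Build the Gradio app. Assumption: the custom static_dir attribute is read by other parts of
# the project (e.g. assets under src/static); Gradio's Blocks does not consume it directly.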
demo = gr.Blocks().queue()
demo.static_dir = STATIC_DIR

with demo:
    gr.HTML(TITLE)
    gr.Markdown(TITLE_MARKDOWN_DESCRIPTION)

    with gr.Tabs() as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="model-leaderboard", id=0):
            initial_df, available_versions = create_leaderboard_df()
            benchmark_version = get_benchmark_version()

            with gr.Row():
                with gr.Column(scale=2):
                    version_dropdown = gr.Dropdown(
                        choices=available_versions,
                        value=available_versions[0],
                        label="Leaderboard Version",
                        interactive=True,
                    )
                with gr.Column(scale=1):
                    gr.HTML(benchmark_version_hyperlink(benchmark_version))
                    gr.HTML(leaderboard_version_hyperlink(available_versions[0]))

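            # Explicit datatypes for gradio_leaderboard: known columns are typed directly,
            # every remaining benchmark column is treated as numeric.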
            column_types = {
                "Model": "str",
                "Overall Score": "number",
                "Open Source": "bool",
                "Reasoning": "bool",
                **{
                    col: "number"
                    for col in initial_df.columns
                    if col not in ["Model", "Overall Score", "Open Source", "Reasoning"]
                },
            }

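            # The boolean columns are hidden by default and only drive the filter checkboxes.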
            leaderboard = Leaderboard(
                value=initial_df,
                datatype=column_types,
                select_columns=SelectColumns(
                    default_selection=[
                        col for col in initial_df.columns if col not in ["Open Source", "Reasoning"]
                    ],
                    cant_deselect=["Model", "Overall Score"],
                ),
                search_columns=["Model"],
                filter_columns=[
                    ColumnFilter(
                        "Open Source",
                        type="boolean",
                        label="Show only open source models",
                        default=False,
                    ),
                    ColumnFilter(
                        "Reasoning",
                        type="boolean",
                        label="Show only reasoning models",
                        default=False,
                    ),
                ],
                bool_checkboxgroup_label="Apply Filters",
            )

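            # Re-render the table when the user switches leaderboard versions.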
            def update_leaderboard(version):
                df, _ = create_leaderboard_df(version)
                return df

            version_dropdown.change(
                fn=update_leaderboard,
                inputs=[version_dropdown],
                outputs=[leaderboard],
            )

        with gr.TabItem("🚀 Submit", elem_id="submit-tab", id=1):
            with gr.Column():
                gr.Markdown(
                    "## Submit your model evaluation results",
                    elem_classes="markdown-text",
                )

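                # Submission form: model identity and type on the left,
                # precision and weight details on the right.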
                with gr.Row():
                    with gr.Column():
                        model_name_textbox = gr.Textbox(label="Model name", placeholder="e.g., GPT-Chemistry")
                        revision_name_textbox = gr.Textbox(label="Version", placeholder="main")
                        model_type = gr.Dropdown(
                            choices=[str(t) for t in ModelType],
                            label="Model type",
                            multiselect=False,
                            value=None,
                            interactive=True,
                        )
                    with gr.Column():
                        precision = gr.Dropdown(
                            choices=[str(p) for p in Precision],
                            label="Precision",
                            multiselect=False,
                            value="float16",
                            interactive=True,
                        )
                        weight_type = gr.Dropdown(
                            choices=[str(w) for w in WeightType],
                            label="Weights type",
                            multiselect=False,
                            value="Original",
                            interactive=True,
                        )
                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

                with gr.Row():
                    with gr.Column():
                        params_number = gr.Number(
                            label="Number of Parameters (in billions)",
                            value=None,
                            info="e.g., 7.0",
                        )
                        is_reasoning = gr.Checkbox(label="Uses reasoning/Chain-of-Thought", value=False)
                        is_moe = gr.Checkbox(label="Is Mixture of Experts (MoE)", value=False)

                with gr.Row():
                    results_file = gr.File(
                        label="Upload Results JSON",
                        file_types=[".json"],
                    )

                with gr.Row(equal_height=True):
                    submit_button = gr.Button("Submit and Update Leaderboard")
                    output = gr.Markdown(value="STATUS ... ", label="Submission Status")

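                # On success, give the user a moment to read the status message, then restart
                # the Space so the refreshed leaderboard data is picked up.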
                def handle_submission(*args):
                    result = process_submission(*args)
                    if "✅" in result:
                        time.sleep(5)
                        API.restart_space(repo_id=REPO_ID)
                    return result

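                # Inputs are positional, so their order must match process_submission's signature.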
                submit_button.click(
                    fn=handle_submission,
                    inputs=[
                        model_name_textbox,
                        base_model_name_textbox,
                        revision_name_textbox,
                        precision,
                        weight_type,
                        model_type,
                        results_file,
                        params_number,
                        is_reasoning,
                        is_moe,
                    ],
                    outputs=output,
                    show_progress=True,
                )

        with gr.TabItem("📝 About", elem_id="about-tab", id=2):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )


if __name__ == "__main__":
    demo.queue()
    demo.launch()
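    # For local debugging, demo.launch() also accepts server_name/server_port overrides;
    # the defaults above are left as-is for the hosted deployment.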