|
import json |
|
import gzip |
|
import gradio as gr |
|
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter, SearchColumns |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
from huggingface_hub import snapshot_download |
|
from src.about import ( |
|
CITATION_BUTTON_LABEL, |
|
CITATION_BUTTON_TEXT, |
|
EVALUATION_QUEUE_TEXT, |
|
INTRODUCTION_TEXT, |
|
TITLE, |
|
) |
|
from src.display.css_html_js import custom_css |
|
from src.display.utils import ( |
|
BENCHMARK_COLS, |
|
BENCHMARK_COLS_MULTIMODAL, |
|
COLS, |
|
COLS_MULTIMODAL, |
|
EVAL_COLS, |
|
AutoEvalColumn, |
|
AutoEvalColumnMultimodal, |
|
fields, |
|
) |
|
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, EVAL_DATASETS_PATH, DATASETS_REPO |
|
from src.populate import get_evaluation_queue_df, get_leaderboard_df |
|
from src.submission.submit import add_new_eval |
|
|
|
|
|
def restart_space():
    """Request a restart of the Space named by ``REPO_ID`` via the shared ``API`` client."""
    target_space = REPO_ID
    API.restart_space(repo_id=target_space)
|
|
|
|
|
|
|
def _download_dataset_snapshot(repo_id, local_dir):
    """Download one dataset repo into ``local_dir``; on any failure, log it and restart.

    Args:
        repo_id: Identifier of the dataset repository passed to ``snapshot_download``.
        local_dir: Directory the snapshot is written to.

    The broad ``except`` is deliberate: start-up is best-effort, and a failed
    download is answered by asking for a Space restart rather than by crashing.
    """
    try:
        print(local_dir)
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as exc:
        # Surface the cause before restarting so the failure is visible in the logs.
        print(f"Snapshot download of {repo_id!r} into {local_dir!r} failed: {exc!r}")
        restart_space()


# Fetch the request queue, the results and the evaluation datasets, in that order.
for _repo_id, _local_dir in (
    (QUEUE_REPO, EVAL_REQUESTS_PATH),
    (RESULTS_REPO, EVAL_RESULTS_PATH),
    (DATASETS_REPO, EVAL_DATASETS_PATH),
):
    _download_dataset_snapshot(_repo_id, _local_dir)
|
|
|
|
|
# Leaderboard tables, computed once at import time from EVAL_RESULTS_PATH and
# EVAL_REQUESTS_PATH; the multimodal table uses its own column sets.
LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    COLS,
    BENCHMARK_COLS,
)
LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    COLS_MULTIMODAL,
    BENCHMARK_COLS_MULTIMODAL,
)

# The queue helper returns three values, unpacked here as finished / running / pending.
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
    EVAL_REQUESTS_PATH, EVAL_COLS
)
|
|
|
|
|
def init_leaderboard(dataframe, track):
    """Build the ``Leaderboard`` component for a single track.

    Args:
        dataframe: Leaderboard table with a ``"Track"`` column. Only the rows whose
            ``"Track"`` value equals ``track`` are displayed.
        track: Track name. ``"multimodal"`` takes its column metadata from
            ``AutoEvalColumnMultimodal``; every other value uses ``AutoEvalColumn``.

    Returns:
        The configured ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty (checked before filtering).
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    dataframe = dataframe.loc[dataframe["Track"] == track]

    # The two tracks families differ only in which column-description class they use,
    # so pick the class once and share the rest of the configuration.
    column_cls = AutoEvalColumnMultimodal if track == "multimodal" else AutoEvalColumn
    column_specs = list(fields(column_cls))

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in column_specs],
        select_columns=SelectColumns(
            default_selection=[c.name for c in column_specs if c.displayed_by_default],
            cant_deselect=[c.name for c in column_specs if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=SearchColumns(
            primary_column=column_cls.model.name,
            placeholder="Search by model name. Separate multiple queries with ';'.",
            label="Search",
            secondary_columns=["Base Architecture"],
        ),
        hide_columns=[c.name for c in column_specs if c.hidden],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
        filter_columns=[
            ColumnFilter("Model Type", type="checkboxgroup", label="Model Type"),
            ColumnFilter("Base Architecture", label="Base Architecture"),
            ColumnFilter("Main Contributions", type="dropdown", label="Main Contributions"),
            ColumnFilter("Optimizer", type="checkboxgroup", label="Optimizer"),
            ColumnFilter("Tokenizer", type="checkboxgroup", label="Tokenizer"),
            ColumnFilter("Training Dataset", type="checkboxgroup", label="Training Data"),
            ColumnFilter("Learning Rate", type="slider", label="Learning Rate"),
            ColumnFilter("Batch Size", type="slider", label="Batch Size"),
            ColumnFilter("Total Number of Parameters (M)", type="slider", label="Total Number of Parameters (M)"),
            ColumnFilter("Total Training PFLOPS", type="slider", label="Total Training PFLOPS"),
            ColumnFilter("Number of Words in Dataset (M)", type="slider", label="Number of Words in Dataset (M)"),
        ],
        wrap=True,
        height=1500,
        min_width=250,
    )
|
|
|
|
|
def process_json(temp_file):
    """Load the JSON document held in an uploaded file.

    Args:
        temp_file: ``None``, a filesystem path string, or an object whose ``name``
            attribute holds the path. Paths ending in ``.gz`` are read through gzip.

    Returns:
        The decoded JSON value, or ``{}`` when ``temp_file`` is ``None``.

    Raises:
        gr.Error: If the file cannot be opened, decoded or parsed; the original
            exception is attached as the cause.
    """
    if temp_file is None:
        return {}

    try:
        # Upload widgets may hand over either a plain path or a file-like wrapper.
        file_path = temp_file if isinstance(temp_file, str) else temp_file.name
        opener = gzip.open if file_path.endswith(".gz") else open
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with opener(file_path, "rt", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}") from e

    # No success component is created here: one built inside a callback is not part
    # of the rendered layout. NOTE(review): use gr.Info in the caller if a success
    # toast is wanted.
    return data
|
|
|
|
|
# Assemble the Gradio UI: one leaderboard tab per track, a submission form, and a
# citation box.
# NOTE(review): the nesting of the layout containers below was reconstructed from
# the statement order; confirm that `results_file` belongs to the left column and
# that `description` / `other_hyp` belong to the right column.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
        with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
            leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
        with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
            leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
        with gr.TabItem("Interaction", elem_id="interaction-benchmark-tab-table", id=3):
            leaderboard = init_leaderboard(LEADERBOARD_DF, "interaction")

        with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=4):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")

            with gr.Row():
                # Left column: model identity, training recipe and dataset details.
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="⚠️ Model name (Unique name of the model, does not have to correspond to the HuggingFace repo name)",
                        placeholder="baseline-10m-gpt-bert-mixed (causal)",
                    )
                    revision_name_textbox = gr.Textbox(label="🔹 Revision commit (main by default)", placeholder="main")
                    approaches = gr.Dropdown(
                        choices=[
                            "Architectural innovations",
                            "Curriculum learning",
                            "Data augmentation",
                            "Data preprocessing",
                            "Hyperparameter tuning",
                            "Linguistic bias",
                            "Multimodality",
                            "Teacher/expert/auxiliary models",
                            "Training objective innovations",
                            "Dataset creation",
                            "Controlled experiments",
                            "Evaluation methods",
                        ],
                        label="👶 Main contributions/approaches, if not in the list, type your choice. Multiple selection allowed (optional if not submitting to challenge)",
                        allow_custom_value=True,
                        multiselect=True,
                        interactive=True,
                        filterable=True,
                    )
                    base_model = gr.Dropdown(
                        choices=[
                            "GPT-2",
                            "Llama",
                            "BERT",
                            "T5",
                            "RoBERTa",
                            "DeBERTa",
                            "LTG-BERT",
                            "LSTM",
                        ],
                        label="👶 Base architecture, if not in the list, type your choice (optional if not submitting to challenge)",
                        allow_custom_value=True,
                        multiselect=False,
                        interactive=True,
                        filterable=True,
                    )
                    learning_rate_scheduler = gr.Textbox(label="👶 Learning rate scheduler (optional if not submitting to challenge)")
                    epochs = gr.Number(label="👶 Number of training epochs (optional if not submitting to challenge)", precision=0)
                    tokenizer = gr.Textbox(label="👶 Tokenizer (optional if not submitting to challenge)")
                    random_seed = gr.Textbox(label="👶 Random Seed (optional if not submitting to challenge)")
                    num_heads = gr.Number(label="👶 Number of attention heads (optional if not submitting to challenge). If attention is not used put -1.", precision=0)
                    max_seq_len = gr.Number(label="👶 Max sequence length (optional if not submitting to challenge)", precision=0)
                    gpu_dev = gr.Number(label="👶 Approximate GPU hours for development (optional if not submitting to challenge)", precision=0)
                    training_data = gr.Dropdown(
                        choices=[
                            "BabyLM strict",
                            "BabyLM strict-small",
                            "BabyLM multimodal",
                        ],
                        label="👶 Training data, if not in the list, type your choice (optional if not submitting to challenge)",
                        value="BabyLM strict",
                        allow_custom_value=True,
                        multiselect=False,
                        interactive=True,
                        filterable=True,
                    )
                    datasize = gr.Number(label="👶 Approximate number of words for custom dataset (optional if not submitting to challenge)", precision=0)
                    data_genre = gr.Textbox(
                        label="👶 Genre of sources for custom dataset (optional if not submitting to challenge). If one of the official BabyLM datasets is chosen you do not need to fill this in.",
                        placeholder="Movie/TV subtitles",
                    )
                    data_preprocessing = gr.Textbox(
                        label="👶 Preprocessing of custom dataset (optional if not submitting to challenge). If one of the official BabyLM datasets is chosen you do not need to fill this in.",
                        placeholder="We removed documents with non-English strings. Documents were separated with newlines.",
                        lines=3,
                    )

                    results_file = gr.File(label="⚠️ Results file (JSON file)", file_types=[".json"])

                # Right column: repository, track, model size/compute and data provenance.
                with gr.Column():
                    hf_repo = gr.Textbox(
                        label="👶 HuggingFace repository (If no HF repo, please put a username to identify your submissions instead)",
                        placeholder="BabyLM-community/babylm-baseline-10m-gpt-bert-mixed or BabyLM-community",
                    )
                    track = gr.Dropdown(
                        choices=["strict", "strict-small", "multimodal", "interaction"],
                        label="⚠️ Track",
                        multiselect=False,
                        value="strict",
                        interactive=True,
                        filterable=True,
                    )
                    model_type = gr.Dropdown(
                        choices=[
                            "Decoder only",
                            "Encoder only",
                            "Encoder-decoder",
                        ],
                        label="👶 Model type, if not in the list, type your choice (optional if not submitting to challenge)",
                        allow_custom_value=True,
                        multiselect=False,
                        interactive=True,
                        filterable=True,
                    )
                    learning_rate = gr.Number(label="👶 Max learning rate (optional if not submitting to challenge)")
                    optimizer = gr.Textbox(label="👶 Optimizer (optional if not submitting to challenge)")
                    batch_size = gr.Number(label="👶 Average batch size (in tokens) (optional if not submitting to challenge)", precision=0)
                    token_set_size = gr.Number(label="👶 Token set size (optional if not submitting to challenge)", precision=0)
                    num_layers = gr.Number(label="👶 Number of layers (optional if not submitting to challenge)", precision=0)
                    total_parameters = gr.Number(label="👶 Total number of parameters (optional if not submitting to challenge)", precision=0)
                    flops = gr.Number(label="👶 Approximate number of training FLOPS (optional if not submitting to challenge)", precision=0)
                    gpu_train = gr.Number(label="👶 Approximate GPU hours for training this model (optional if not submitting to challenge)", precision=0)
                    data_human = gr.Dropdown(
                        choices=[
                            "Not applicable",
                            "No",
                        ],
                        label="👶 Custom data human annotation, type your choice if applicable. Example: We had humans provide preference data for model generated sentences.",
                        value="Not applicable",
                        allow_custom_value=True,
                        filterable=True,
                        interactive=True,
                    )
                    data_aug = gr.Dropdown(
                        choices=[
                            "Not applicable",
                            "No",
                        ],
                        label="👶 Custom synthetic data or data augmentation, type your choice if applicable. Example: We used a pretrained T5 model to reword sentences from the original corpus.",
                        value="Not applicable",
                        allow_custom_value=True,
                        filterable=True,
                        interactive=True,
                    )

                    description = gr.Textbox(
                        label="👶 Brief textual description of the model (optional if not submitting to challenge)",
                        placeholder="This is a baseline that uses the same hyperparameters as our competition entry but does not use the same curriculum learning approach.",
                        lines=8,
                    )

                    other_hyp = gr.File(label="🔹 Other hyperparameters (JSON file)", file_types=[".json"])

            submit_button = gr.Button("Submit Results")
            submission_result = gr.Markdown()
            # The input order below is the positional argument order handed to
            # add_new_eval; keep the two in sync when adding or moving fields.
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    revision_name_textbox,
                    hf_repo,
                    track,
                    results_file,
                    model_type,
                    approaches,
                    base_model,
                    learning_rate_scheduler,
                    epochs,
                    tokenizer,
                    random_seed,
                    num_heads,
                    max_seq_len,
                    gpu_dev,
                    training_data,
                    datasize,
                    data_genre,
                    learning_rate,
                    optimizer,
                    batch_size,
                    token_set_size,
                    num_layers,
                    total_parameters,
                    flops,
                    gpu_train,
                    data_human,
                    data_preprocessing,
                    data_aug,
                    description,
                    other_hyp,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
|
|
|
# Schedule restart_space to run every 30 minutes in the background, then serve the app.
scheduler = BackgroundScheduler()
scheduler.add_job(
    restart_space,
    "interval",
    seconds=30 * 60,
)
scheduler.start()

demo.launch(share=True, ssr_mode=False)
|
|