import json
import gzip
import gradio as gr
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter, SearchColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
BENCHMARK_COLS_MULTIMODAL,
COLS,
COLS_MULTIMODAL,
EVAL_COLS,
AutoEvalColumn,
AutoEvalColumnMultimodal,
fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, EVAL_DATASETS_PATH, DATASETS_REPO
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
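

# Restarting the Space re-runs this script, retriggering the snapshot downloads below.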
def restart_space():
API.restart_space(repo_id=REPO_ID)

# Space initialisation: mirror the evaluation queue, results, and evaluation
# datasets from the Hub; if a download fails, restart the Space to retry.
try:
print(EVAL_REQUESTS_PATH)
snapshot_download(
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
try:
print(EVAL_RESULTS_PATH)
snapshot_download(
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
try:
print(EVAL_DATASETS_PATH)
snapshot_download(
repo_id=DATASETS_REPO, local_dir=EVAL_DATASETS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
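
# Build the leaderboard tables once at startup (text-only and multimodal views)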
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard(dataframe, track):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    # Filter for the requested track
    dataframe = dataframe.loc[dataframe["Track"] == track]
    # The multimodal track has its own column schema; all other tracks share AutoEvalColumn
    column_class = AutoEvalColumnMultimodal if track == "multimodal" else AutoEvalColumn
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(column_class)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(column_class) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(column_class) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=SearchColumns(
            primary_column=column_class.model.name,
            placeholder="Search by model name. Separate multiple queries with ';'.",
            label="Search",
            secondary_columns=["Base Architecture"],
        ),
        hide_columns=[c.name for c in fields(column_class) if c.hidden],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
        filter_columns=[
            ColumnFilter("Model Type", type="checkboxgroup", label="Model Type"),
            ColumnFilter("Base Architecture", label="Base Architecture"),
            ColumnFilter("Main Contributions", type="dropdown", label="Main Contributions"),
            ColumnFilter("Optimizer", type="checkboxgroup", label="Optimizer"),
            ColumnFilter("Tokenizer", type="checkboxgroup", label="Tokenizer"),
            ColumnFilter("Training Dataset", type="checkboxgroup", label="Training Data"),
            ColumnFilter("Learning Rate", type="slider", label="Learning Rate"),
            ColumnFilter("Batch Size", type="slider", label="Batch Size"),
            ColumnFilter("Total Number of Parameters (M)", type="slider", label="Total Number of Parameters (M)"),
            ColumnFilter("Total Training PFLOPS", type="slider", label="Total Training PFLOPS"),
            ColumnFilter("Number of Words in Dataset (M)", type="slider", label="Number of Words in Dataset (M)"),
        ],
        wrap=True,
        height=1500,
        min_width=250,
    )


def process_json(temp_file):
    """Load an uploaded results file (plain or gzipped JSON) into a dict."""
    if temp_file is None:
        return {}
    # Handle the uploaded file, transparently decompressing .gz archives
    try:
        file_path = temp_file.name
        if file_path.endswith(".gz"):
            with gzip.open(file_path, "rt") as f:
                data = json.load(f)
        else:
            with open(file_path, "r") as f:
                data = json.load(f)
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}")
    # A gr.Markdown created inside a callback is never rendered; gr.Info shows a toast instead
    gr.Info("Upload successful!")
    return data
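
# Assemble the Gradio app: one leaderboard tab per track plus a submission tab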
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
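        # One leaderboard tab per evaluation track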
with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
with gr.TabItem("Interaction", elem_id="interaction-benchmark-tab-table", id=3):
leaderboard = init_leaderboard(LEADERBOARD_DF, "interaction")
        with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=4):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Row():
                    gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
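                        # Left column: model details, training setup, and the results file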
                        model_name_textbox = gr.Textbox(label="⚠️ Model name (Unique name of the model, does not have to correspond to the HuggingFace repo name)", placeholder="baseline-10m-gpt-bert-mixed (causal)")
                        revision_name_textbox = gr.Textbox(label="🔹 Revision commit (main by default)", placeholder="main")
approaches = gr.Dropdown(
choices=[
"Architectural innovations",
"Curriculum learning",
"Data augmentation",
"Data preprocessing",
"Hyperparameter tuning",
"Linguistic bias",
"Multimodality",
"Teacher/expert/auxiliary models",
"Training objective innovations",
"Dataset creation",
"Controlled experiments",
"Evaluation methods",
],
                            label="👶 Main contributions/approaches; if not in the list, type your choice. Multiple selections allowed (optional if not submitting to challenge)",
allow_custom_value=True,
multiselect=True,
interactive=True,
filterable=True,
)
base_model = gr.Dropdown(
choices=[
"GPT-2",
"Llama",
"BERT",
"T5",
"RoBERTa",
"DeBERTa",
"LTG-BERT",
"LSTM",
],
                            label="👶 Base architecture, if not in the list, type your choice (optional if not submitting to challenge)",
allow_custom_value=True,
multiselect=False,
interactive=True,
filterable=True,
)
                        learning_rate_scheduler = gr.Textbox(label="👶 Learning rate scheduler (optional if not submitting to challenge)")
                        epochs = gr.Number(label="👶 Number of training epochs (optional if not submitting to challenge)", precision=0)
                        tokenizer = gr.Textbox(label="👶 Tokenizer (optional if not submitting to challenge)")
                        random_seed = gr.Textbox(label="👶 Random seed (optional if not submitting to challenge)")
                        num_heads = gr.Number(label="👶 Number of attention heads (optional if not submitting to challenge). If attention is not used, put -1.", precision=0)
                        max_seq_len = gr.Number(label="👶 Max sequence length (optional if not submitting to challenge)", precision=0)
                        gpu_dev = gr.Number(label="👶 Approximate GPU hours for development (optional if not submitting to challenge)", precision=0)
training_data = gr.Dropdown(
choices=[
"BabyLM strict",
"BabyLM strict-small",
"BabyLM multimodal",
],
                            label="👶 Training data, if not in the list, type your choice (optional if not submitting to challenge)",
value="BabyLM strict",
allow_custom_value=True,
multiselect=False,
interactive=True,
filterable=True,
)
                        datasize = gr.Number(label="👶 Approximate number of words for custom dataset (optional if not submitting to challenge)", precision=0)
                        data_genre = gr.Textbox(label="👶 Genre of sources for custom dataset (optional if not submitting to challenge). If one of the official BabyLM datasets is chosen, you do not need to fill this in.", placeholder="Movie/TV subtitles")
                        data_preprocessing = gr.Textbox(label="👶 Preprocessing of custom dataset (optional if not submitting to challenge). If one of the official BabyLM datasets is chosen, you do not need to fill this in.", placeholder="We removed documents with non-English strings. Documents were separated with newlines.", lines=3)
                        results_file = gr.File(label="⚠️ Results file (JSON file)", file_types=[".json"])
with gr.Column():
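                        # Right column: repository, track, and remaining hyperparameters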
                        hf_repo = gr.Textbox(label="👶 HuggingFace repository (If no HF repo, please put a username to identify your submissions instead)", placeholder="BabyLM-community/babylm-baseline-10m-gpt-bert-mixed or BabyLM-community")
track = gr.Dropdown(
choices=["strict", "strict-small", "multimodal", "interaction"],
                            label="⚠️ Track",
multiselect=False,
value="strict",
interactive=True,
filterable=True,
)
model_type = gr.Dropdown(
choices=[
"Decoder only",
"Encoder only",
"Encoder-decoder",
],
                            label="👶 Model type, if not in the list, type your choice (optional if not submitting to challenge)",
allow_custom_value=True,
multiselect=False,
interactive=True,
filterable=True,
)
                        learning_rate = gr.Number(label="👶 Max learning rate (optional if not submitting to challenge)")
                        optimizer = gr.Textbox(label="👶 Optimizer (optional if not submitting to challenge)")
                        batch_size = gr.Number(label="👶 Average batch size (in tokens) (optional if not submitting to challenge)", precision=0)
                        token_set_size = gr.Number(label="👶 Token set size (optional if not submitting to challenge)", precision=0)
                        num_layers = gr.Number(label="👶 Number of layers (optional if not submitting to challenge)", precision=0)
                        total_parameters = gr.Number(label="👶 Total number of parameters (optional if not submitting to challenge)", precision=0)
                        flops = gr.Number(label="👶 Approximate number of training FLOPS (optional if not submitting to challenge)", precision=0)
                        gpu_train = gr.Number(label="👶 Approximate GPU hours for training this model (optional if not submitting to challenge)", precision=0)
data_human = gr.Dropdown(
choices=[
"Not applicable",
"No",
],
                            label="👶 Custom data human annotation, type your choice if applicable. Example: We had humans provide preference data for model generated sentences.",
value="Not applicable",
allow_custom_value=True,
filterable=True,
interactive=True
)
data_aug = gr.Dropdown(
choices=[
"Not applicable",
"No",
],
                            label="👶 Custom synthetic data or data augmentation, type your choice if applicable. Example: We used a pretrained T5 model to reword sentences from the original corpus.",
value="Not applicable",
allow_custom_value=True,
filterable=True,
interactive=True
)
                        description = gr.Textbox(label="👶 Brief textual description of the model (optional if not submitting to challenge)", placeholder="This is a baseline that uses the same hyperparameters as our competition entry but does not use the same curriculum learning approach.", lines=8)
                        other_hyp = gr.File(label="🔹 Other hyperparameters (JSON file)", file_types=[".json"])
submit_button = gr.Button("Submit Results")
submission_result = gr.Markdown()
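                # Inputs are passed positionally, so this list must match the
                # parameter order of add_new_eval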
submit_button.click(
add_new_eval,
[
model_name_textbox,
revision_name_textbox,
hf_repo,
track,
results_file,
model_type,
approaches,
base_model,
learning_rate_scheduler,
epochs,
tokenizer,
random_seed,
num_heads,
max_seq_len,
gpu_dev,
training_data,
datasize,
data_genre,
learning_rate,
optimizer,
batch_size,
token_set_size,
num_layers,
total_parameters,
flops,
gpu_train,
data_human,
data_preprocessing,
data_aug,
description,
other_hyp,
],
submission_result)
with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
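

# Restart the Space every 30 minutes so newly submitted results are picked up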
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.launch(share=True, ssr_mode=False)