Spaces:

BabyLM-community
/

babylm-leaderboard-2025-all-tasks

Running

babylm-leaderboard-2025-all-tasks / app.py

Lucas Georges Gabriel Charpentier Pacheco

Adding hidden tasks and multimodal

7a8dc86 10 days ago

18.4 kB

	import json
	import gzip
	import gradio as gr
	from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter, SearchColumns
	from apscheduler.schedulers.background import BackgroundScheduler
	from huggingface_hub import snapshot_download
	from src.about import (
	CITATION_BUTTON_LABEL,
	CITATION_BUTTON_TEXT,
	EVALUATION_QUEUE_TEXT,
	INTRODUCTION_TEXT,
	TITLE,
	)
	from src.display.css_html_js import custom_css
	from src.display.utils import (
	BENCHMARK_COLS,
	BENCHMARK_COLS_MULTIMODAL,
	COLS,
	COLS_MULTIMODAL,
	EVAL_COLS,
	AutoEvalColumn,
	AutoEvalColumnMultimodal,
	fields,
	)
	from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, EVAL_DATASETS_PATH, DATASETS_REPO
	from src.populate import get_evaluation_queue_df, get_leaderboard_df
	from src.submission.submit import add_new_eval


	def restart_space():
	    """Ask the Hub API to restart this Space.

	    Delegates to ``API.restart_space`` using the ``REPO_ID`` imported from
	    ``src.envs``. Returns ``None``.
	    """
	    space_repo_id = REPO_ID
	    API.restart_space(repo_id=space_repo_id)


	def _sync_dataset_repo(repo_id, local_dir):
	    """Download one Hub dataset repo into ``local_dir``; restart the Space on failure.

	    Args:
	        repo_id: Identifier of the dataset repository to snapshot.
	        local_dir: Local directory the snapshot is written to.

	    Any exception raised while downloading is reported on stdout and then
	    handled by calling ``restart_space()``; it is not re-raised.
	    """
	    try:
	        print(local_dir)
	        snapshot_download(
	            repo_id=repo_id,
	            local_dir=local_dir,
	            repo_type="dataset",
	            tqdm_class=None,
	            etag_timeout=30,
	            token=TOKEN,
	        )
	    except Exception as download_error:
	        # Report why the download failed before restarting; previously the
	        # exception was dropped, leaving no trace of the cause in the logs.
	        print(f"Failed to download {repo_id} into {local_dir}: {download_error!r}")
	        restart_space()


	# Space initialisation: fetch the request queue, the results and the datasets
	# repos, in that order.
	_sync_dataset_repo(QUEUE_REPO, EVAL_REQUESTS_PATH)
	_sync_dataset_repo(RESULTS_REPO, EVAL_RESULTS_PATH)
	_sync_dataset_repo(DATASETS_REPO, EVAL_DATASETS_PATH)


	# Leaderboard tables: one built with the default column set, one with the
	# multimodal column set. Both read from the same results/requests paths.
	LEADERBOARD_DF = get_leaderboard_df(
	    EVAL_RESULTS_PATH,
	    EVAL_REQUESTS_PATH,
	    COLS,
	    BENCHMARK_COLS,
	)
	LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(
	    EVAL_RESULTS_PATH,
	    EVAL_REQUESTS_PATH,
	    COLS_MULTIMODAL,
	    BENCHMARK_COLS_MULTIMODAL,
	)

	# get_evaluation_queue_df returns three values, unpacked here as the
	# finished, running and pending queue frames.
	finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = (
	    get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
	)


	def init_leaderboard(dataframe, track):
	    """Build the ``Leaderboard`` component for a single track.

	    Args:
	        dataframe: Leaderboard data with a ``"Track"`` column. Only the rows
	            whose ``"Track"`` value equals ``track`` are shown.
	        track: Name of the track. ``"multimodal"`` selects the
	            ``AutoEvalColumnMultimodal`` column schema; any other value
	            selects ``AutoEvalColumn``.

	    Returns:
	        The configured ``Leaderboard`` component.

	    Raises:
	        ValueError: If ``dataframe`` is ``None`` or empty. The check runs
	            before the track filter, so a track with no rows does not raise.
	    """
	    if dataframe is None or dataframe.empty:
	        raise ValueError("Leaderboard DataFrame is empty or None.")

	    # The multimodal and non-multimodal configurations are identical except
	    # for the column class, so choose it once instead of duplicating the call.
	    column_cls = AutoEvalColumnMultimodal if track == "multimodal" else AutoEvalColumn
	    # Materialised once and reused below; assumes fields() is a side-effect-free
	    # accessor that yields the same columns on every call.
	    columns = list(fields(column_cls))

	    # filter for correct track
	    dataframe = dataframe.loc[dataframe["Track"] == track]

	    return Leaderboard(
	        value=dataframe,
	        datatype=[c.type for c in columns],
	        select_columns=SelectColumns(
	            default_selection=[c.name for c in columns if c.displayed_by_default],
	            cant_deselect=[c.name for c in columns if c.never_hidden],
	            label="Select Columns to Display:",
	        ),
	        search_columns=SearchColumns(
	            primary_column=column_cls.model.name,
	            placeholder="Search by model name. Seperate multiple queries with ';'.",
	            label="Search",
	            secondary_columns=["Base Architecture"],
	        ),
	        hide_columns=[c.name for c in columns if c.hidden],
	        bool_checkboxgroup_label="Hide models",
	        interactive=False,
	        filter_columns=[
	            ColumnFilter("Model Type", type="checkboxgroup", label="Model Type"),
	            ColumnFilter("Base Architecture", label="Base Architecture"),
	            ColumnFilter("Main Contributions", type="dropdown", label="Main Contributions"),
	            ColumnFilter("Optimizer", type="checkboxgroup", label="Optimizer"),
	            ColumnFilter("Tokenizer", type="checkboxgroup", label="Tokenizer"),
	            ColumnFilter("Training Dataset", type="checkboxgroup", label="Training Data"),
	            ColumnFilter("Learning Rate", type="slider", label="Learning Rate"),
	            ColumnFilter("Batch Size", type="slider", label="Batch Size"),
	            ColumnFilter("Total Number of Parameters (M)", type="slider", label="Total Number of Parameters (M)"),
	            ColumnFilter("Total Training PFLOPS", type="slider", label="Total Training PFLOPS"),
	            ColumnFilter("Number of Words in Dataset (M)", type="slider", label="Number of Words in Dataset (M)"),
	        ],
	        wrap=True,
	        height=1500,
	        min_width=250,
	    )


	def process_json(temp_file):
	    """Load an uploaded JSON file, optionally gzip-compressed.

	    Args:
	        temp_file: The upload to read. Either an object exposing the file
	            path as ``.name`` or the path itself; ``None`` means nothing was
	            uploaded.

	    Returns:
	        The decoded JSON content, or ``{}`` when ``temp_file`` is ``None``.

	    Raises:
	        gr.Error: If the file cannot be read or does not contain valid JSON.
	    """
	    if temp_file is None:
	        return {}

	    # Handle file upload. Accept both a file-like wrapper (``.name``) and a
	    # plain path, since the value handed over depends on the File component's
	    # configuration.
	    file_path = getattr(temp_file, "name", temp_file)
	    opener = gzip.open if str(file_path).endswith(".gz") else open
	    try:
	        # Decode as UTF-8 explicitly rather than relying on the platform default.
	        with opener(file_path, "rt", encoding="utf-8") as f:
	            data = json.load(f)
	    except Exception as e:
	        raise gr.Error(f"Error processing file: {str(e)}") from e

	    # A component created here would never be attached to the layout, so use
	    # Gradio's notification helper to report success instead.
	    gr.Info("Upload successful!")
	    return data


	# Assemble the Gradio UI: four per-track leaderboard tabs, a submission form
	# tab, and a citation box.
	# NOTE(review): indentation is flattened in this listing, so the exact nesting
	# of the layout containers (Row/Column) below cannot be confirmed from here.
	demo = gr.Blocks(css=custom_css)
	with demo:
	gr.HTML(TITLE)
	gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

	with gr.Tabs(elem_classes="tab-buttons") as tabs:
	# One leaderboard per track. The multimodal tab reads LEADERBOARD_DF_MULTIMODAL;
	# the other three read LEADERBOARD_DF. `leaderboard` is rebound by each tab and
	# is not read again in the code shown here.
	with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
	leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
	with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
	leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
	with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
	leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
	with gr.TabItem("Interaction", elem_id="interaction-benchmark-tab-table", id=3):
	leaderboard = init_leaderboard(LEADERBOARD_DF, "interaction")

	# Submission tab: instructions, then the form whose values are sent to add_new_eval.
	with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=4):
	with gr.Column():
	with gr.Row():
	gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

	with gr.Row():
	gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")

	with gr.Row():
	# First group of form fields.
	with gr.Column():
	model_name_textbox = gr.Textbox(label="⚠️ Model name (Unique name of the model, does not have to correspond to the HuggingFace repo name)", placeholder="baseline-10m-gpt-bert-mixed (causal)")
	revision_name_textbox = gr.Textbox(label="🔹 Revision commit (main by default)", placeholder="main")
	# Multi-select; values outside the list may be typed in (allow_custom_value=True).
	approaches = gr.Dropdown(
	choices=[
	"Architectural innovations",
	"Curriculum learning",
	"Data augmentation",
	"Data preprocessing",
	"Hyperparameter tuning",
	"Linguistic bias",
	"Multimodality",
	"Teacher/expert/auxiliary models",
	"Training objective innovations",
	"Dataset creation",
	"Controlled experiments",
	"Evaluation methods",
	],
	label="👶 Main contributions/approaches, if not in the list, type your choice. Multiple selection allowed (optional if not submitting to challenge)",
	allow_custom_value=True,
	multiselect=True,
	interactive=True,
	filterable=True,
	)
	base_model = gr.Dropdown(
	choices=[
	"GPT-2",
	"Llama",
	"BERT",
	"T5",
	"RoBERTa",
	"DeBERTa",
	"LTG-BERT",
	"LSTM",
	],
	label="👶 Base architecture, if not in the list, type your choice (optional if not submitting to challenge)",
	allow_custom_value=True,
	multiselect=False,
	interactive=True,
	filterable=True,
	)
	learning_rate_scheduler = gr.Textbox(label="👶 Learning rate scheduler (optional if not submitting to challenge)")
	# precision=0 is set on the count-like Number inputs below.
	epochs = gr.Number(label="👶 Number of training epochs (optional if not submitting to challenge)", precision=0)
	tokenizer = gr.Textbox(label="👶 Tokenizer (optional if not submitting to challenge)")
	random_seed = gr.Textbox(label="👶 Random Seed (optional if not submitting to challenge)")
	num_heads = gr.Number(label="👶 Number of attention heads (optional if not submitting to challenge). If attention is not used put -1.", precision=0)
	max_seq_len = gr.Number(label="👶 Max sequence length (optional if not submitting to challenge)", precision=0)
	gpu_dev = gr.Number(label="👶 Approximate GPU hours for development (optional if not submitting to challenge)", precision=0)
	training_data = gr.Dropdown(
	choices=[
	"BabyLM strict",
	"BabyLM strict-small",
	"BabyLM multimodal",
	],
	label="👶 Training data, if not in the list, type your choice (optional if not submitting to challenge)",
	value="BabyLM strict",
	allow_custom_value=True,
	multiselect=False,
	interactive=True,
	filterable=True,
	)
	datasize = gr.Number(label="👶 Approximate number of words for custom dataset (optional if not submitting to challenge)", precision=0)
	data_genre = gr.Textbox(label="👶 Genre of sources for cutom dataset (optional if not submitting to challenge). If one of the official BabyLM dataset is chose you do not need to fill this in.", placeholder="Movie/TV subtitles")
	data_preprocessing = gr.Textbox(label="👶 Preprocessing of custom dataset (optional if not submitting to challenge). If one of the official BabyLM dataset is chose you do not need to fill this in.", placeholder="We removed documents with non-English strings. Documents were seperated with newlines.", lines=3)

	# Results upload; the picker is restricted to ".json" files.
	results_file = gr.File(label="⚠️ Results file (JSON file)", file_types=[".json"])

	# Second group of form fields.
	with gr.Column():
	hf_repo = gr.Textbox(label="👶 HuggingFace repository (If no HF repo, please put a username to identify your submissions instead)", placeholder="BabyLM-community/babylm-baseline-10m-gpt-bert-mixed or BabyLM-community")
	# Choices mirror the track names passed to init_leaderboard for the tabs.
	track = gr.Dropdown(
	choices=["strict", "strict-small", "multimodal", "interaction"],
	label="⚠️ Track",
	multiselect=False,
	value="strict",
	interactive=True,
	filterable=True,
	)
	model_type = gr.Dropdown(
	choices=[
	"Decoder only",
	"Encoder only",
	"Encoder-decoder",
	],
	label="👶 Model type, if not in the list, type your choice (optional if not submitting to challenge)",
	allow_custom_value=True,
	multiselect=False,
	interactive=True,
	filterable=True,
	)
	# learning_rate is the only Number input here without precision=0.
	learning_rate = gr.Number(label="👶 Max learning rate (optional if not submitting to challenge)")
	optimizer = gr.Textbox(label="👶 Optimizer (optional if not submitting to challenge)")
	batch_size = gr.Number(label="👶 Average batch size (in tokens) (optional if not submitting to challenge)", precision=0)
	token_set_size = gr.Number(label="👶 Token set size (optional if not submitting to challenge)", precision=0)
	num_layers = gr.Number(label="👶 Number of layers (optional if not submitting to challenge)", precision=0)
	total_parameters = gr.Number(label="👶 Total number of parameters (optional if not submitting to challenge)", precision=0)
	flops = gr.Number(label="👶 Approximate number of training FLOPS (optional if not submitting to challenge)", precision=0)
	gpu_train = gr.Number(label="👶 Approximate GPU hours for training this model (optional if not submitting to challenge)", precision=0)
	# Both dropdowns below default to "Not applicable" and accept free text.
	data_human = gr.Dropdown(
	choices=[
	"Not applicable",
	"No",
	],
	label="👶 Custom data human annotation, type your choice if applicable. Example: We had humans provide preference data for model generated sentences.",
	value="Not applicable",
	allow_custom_value=True,
	filterable=True,
	interactive=True
	)
	data_aug = gr.Dropdown(
	choices=[
	"Not applicable",
	"No",
	],
	label="👶 Custom synthetic data or data augmentation, type your choice if applicable. Example: We used a pretrained T5 model to reword sentences from the original corpus.",
	value="Not applicable",
	allow_custom_value=True,
	filterable=True,
	interactive=True
	)

	description = gr.Textbox(label="👶 Brief textual description of the model (optional if not submitting to challenge)", placeholder="This is a baseline that uses the same hyperparameters as our competition entry but does not use the same curriculum learning approach.", lines=8)

	other_hyp = gr.File(label="🔹 Other hyperparameters (JSON file)", file_types=[".json"])

	# Clicking the button calls add_new_eval with the current values of the
	# components listed below, in this order, and writes its return value to
	# submission_result. The order must match add_new_eval's parameters (defined
	# in src.submission.submit, not visible here). Note that data_preprocessing
	# is passed between data_human and data_aug, not right after data_genre.
	submit_button = gr.Button("Submit Results")
	submission_result = gr.Markdown()
	submit_button.click(
	add_new_eval,
	[
	model_name_textbox,
	revision_name_textbox,
	hf_repo,
	track,
	results_file,
	model_type,
	approaches,
	base_model,
	learning_rate_scheduler,
	epochs,
	tokenizer,
	random_seed,
	num_heads,
	max_seq_len,
	gpu_dev,
	training_data,
	datasize,
	data_genre,
	learning_rate,
	optimizer,
	batch_size,
	token_set_size,
	num_layers,
	total_parameters,
	flops,
	gpu_train,
	data_human,
	data_preprocessing,
	data_aug,
	description,
	other_hyp,
	],
	submission_result)

	# Collapsed-by-default accordion holding the citation text with a copy button.
	with gr.Row():
	with gr.Accordion("📙 Citation", open=False):
	citation_button = gr.Textbox(
	value=CITATION_BUTTON_TEXT,
	label=CITATION_BUTTON_LABEL,
	lines=20,
	elem_id="citation-button",
	show_copy_button=True,
	)

	# Schedule restart_space to run every 30 minutes on a background scheduler,
	# then start serving the UI.
	scheduler = BackgroundScheduler()
	scheduler.add_job(restart_space, trigger="interval", seconds=30 * 60)
	scheduler.start()
	demo.launch(share=True, ssr_mode=False)