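"""Gradio Space app for the PWM leaderboard: renders the published benchmark results
(IntPhys 2, MVPBench, CausalVQA) and lets logged-in users submit model predictions
for scoring."""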
import copy
import datetime
import json
import os
import re
from email.utils import parseaddr

import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import Dataset, DatasetDict, VerificationMode, get_dataset_config_names, load_dataset
from huggingface_hub import HfApi

from content import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    SUBMISSION_TEXT,
    TITLE,
    format_error,
    format_log,
    format_warning,
    model_hyperlink,
)
TOKEN = os.environ.get("HF_TOKEN", None)

OWNER = "facebook"

# Private datasets
SUBMISSION_DATASET = f"{OWNER}/pwm_leaderboard_submissions_internal"
CONTACT_DATASET = f"{OWNER}/pwm_leaderboard_contact_info_internal"
# Public datasets
RESULTS_DATASET = f"{OWNER}/pwm_leaderboard_results_public"
LEADERBOARD_PATH = f"{OWNER}/pwm_leaderboard"
DATA_VERSION = "1.0.0"

# Dataset paths
MVP_DATASET = "facebook/minimal_video_pairs"
INTP_DATASET = "facebook/IntPhys2_test"
WMQA_DATASET = "facebook/CausalVQA"

# Dataset names
MVP_NAME = "MVPBench"
INTP_NAME = "IntPhys 2"
WMQA_NAME = "CausalVQA"

# Dataset keys
MVP_KEY = "mvp"
MVP_MINI_KEY = "mvp_mini"
INTP_KEY = "intphys2"
WMQA_KEY = "causalvqa"

TASKS = [
    (INTP_KEY, INTP_NAME),
    (MVP_KEY, MVP_NAME),
    (WMQA_KEY, WMQA_NAME),
]
VISIBLE_TASKS = copy.deepcopy(TASKS)

PRE_COL_NAMES = ["Model Name"]
POST_COL_NAMES = ["Model Type", "Vision Backbone", "LLM Backbone", "Submission Date"]

api = HfApi()
os.makedirs("scored", exist_ok=True)

LOCAL_DEBUG = False

# Display settings for the leaderboard
LDB_TEXT_KEYS = ["model", "model_type", "vision_backbone", "llm_backbone"]
LDB_TEXT_TYPES = ["markdown", "text", "text", "text"]
MISSING_VALUE = -1.0
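# Reference entries that are always injected into the leaderboard. MISSING_VALUE marks
# benchmarks a baseline was not evaluated on; those cells are rendered as "-".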
HUMAN_BASELINES = {
    "url": "",
    "model": "Human",
    "model_type": "Human",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": -1,
    f"score_{INTP_KEY}": 92.44,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 92.9,
    f"score_{WMQA_KEY}": 84.78,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
GEMINI2_5 = {
    "url": "https://deepmind.google/models/gemini/flash/",
    "model": "Gemini 2.5 Flash",
    "model_type": "Closed",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": 10,
    f"score_{INTP_KEY}": 56.1,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": MISSING_VALUE,
    f"score_{WMQA_KEY}": 61.66,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
GPT4O = {
    "url": "https://openai.com/index/gpt-4o-system-card/",
    "model": "GPT-4o",
    "model_type": "Closed",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": 10,
    f"score_{INTP_KEY}": 53.19,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 32.5,
    f"score_{WMQA_KEY}": 50.95,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
INTERN_VL = {
    "url": "https://internvl.github.io/blog/2024-12-05-InternVL-2.5/",
    "model": "InternVL2.5",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "InternViT-300M",
    "llm_backbone": "InternLM2.5-7B-Chat",
    "num_frames": 16,
    f"score_{INTP_KEY}": MISSING_VALUE,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 39.9,
    f"score_{WMQA_KEY}": 47.54,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
LLAVA = {
    "url": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov",
    "model": "LLaVA-OneVision",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "SigLIP",
    "llm_backbone": "Qwen2-7B",
    "num_frames": 16,
    f"score_{INTP_KEY}": MISSING_VALUE,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 20.7,
    f"score_{WMQA_KEY}": 45.27,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
PLM = {
    "url": "https://github.com/facebookresearch/perception_models",
    "model": "Perception Language Model (PLM)",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "PE",
    "llm_backbone": "Llama3.1 8B",
    "num_frames": 16,
    f"score_{INTP_KEY}": MISSING_VALUE,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 39.7,
    f"score_{WMQA_KEY}": 50.06,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
QWENVL = {
    "url": "https://github.com/QwenLM/Qwen2.5-VL",
    "model": "Qwen2.5-VL",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "ViT",
    "llm_backbone": "Qwen2.5-7B-Instruct",
    "num_frames": 16,
    f"score_{INTP_KEY}": 49.12,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 36.7,
    f"score_{WMQA_KEY}": 49.05,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
GEMINI1_5 = {
    "url": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/1-5-pro",
    "model": "Gemini 1.5 Pro",
    "model_type": "Closed",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": -1,
    f"score_{INTP_KEY}": 52.1,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 29.6,
    f"score_{WMQA_KEY}": MISSING_VALUE,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
VJEPA2 = {
    "url": "https://ai.meta.com/vjepa/",
    "model": "V-JEPA 2",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "VJEPA 2",
    "llm_backbone": "Llama3.1 8B",
    "num_frames": -1,
    f"score_{INTP_KEY}": 56.4,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 44.5,
    f"score_{WMQA_KEY}": 38.99,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
COSMOS = {
    "url": "https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-4B",
    "model": "Cosmos-4B",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": -1,
    f"score_{INTP_KEY}": 48.84,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": MISSING_VALUE,
    f"score_{WMQA_KEY}": MISSING_VALUE,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}
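# Build the leaderboard DataFrame for a given split: keep a single row per
# (model, organization) pair -- preferring the most recent non-missing values --
# append the reference baselines above, compute an average score used for sorting,
# and rename/reorder the columns for display.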
def get_dataframe_from_results(eval_results, split):
    local_df = eval_results[split]
    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["system_prompt"])  # , "url"]
    df = pd.DataFrame(local_df)

    # Reformat the data to keep a single row for a given (model, organization) pair;
    # in case of multiple entries, keep the latest values.
    df["model_org"] = df["model"].str.cat(df["organization"], sep="-")
    ldb_m2r = {}
    for i, row in df.iterrows():
        if row["model_org"] not in ldb_m2r:
            ldb_m2r[row["model_org"]] = {}
        prev_d = ldb_m2r[row["model_org"]]
        new_d = {}
        for key in LDB_TEXT_KEYS:
            new_d[key] = row[key] if len(row[key]) > 0 else prev_d.get(key, "NA")
        for tname, _ in TASKS:
            new_d[f"score_{tname}"] = (
                row[f"score_{tname}"] if row[f"score_{tname}"] >= 0 else prev_d.get(f"score_{tname}", MISSING_VALUE)
            )
            if tname == "mvp":
                new_d["score_mvp_mini"] = (
                    row["score_mvp_mini"]
                    if row["score_mvp_mini"] >= 0
                    else prev_d.get("score_mvp_mini", MISSING_VALUE)
                )
        new_d["date"] = row["date"]
        ldb_m2r[row["model_org"]] = new_d

    # Add the human and reference model baselines
    ldb_m2r["human"] = HUMAN_BASELINES
    ldb_m2r["gemini2.5"] = GEMINI2_5
    ldb_m2r["gemini1.5"] = GEMINI1_5
    ldb_m2r["gpt4o"] = GPT4O
    ldb_m2r["internvl"] = INTERN_VL
    ldb_m2r["llavaov"] = LLAVA
    ldb_m2r["plm"] = PLM
    ldb_m2r["qwen2.5"] = QWENVL
    ldb_m2r["vjepa2"] = VJEPA2
    ldb_m2r["cosmos"] = COSMOS

    # Compute the average score and convert back to rows
    ldb_rows = []
    for key, val in ldb_m2r.items():
        print(ldb_m2r[key])
        if "url" in ldb_m2r[key].keys() and ldb_m2r[key]["url"] != "":
            ldb_m2r[key]["model"] = model_hyperlink(ldb_m2r[key]["url"], ldb_m2r[key]["model"])
        row = copy.deepcopy(val)
        score_keys = {k for k in val if k.startswith("score_")}
        row["score"] = np.round(
            np.mean([row[sk] for sk in score_keys if (row[sk] != MISSING_VALUE and row[sk] != "-")]), 2
        )
        tasks_completed = 0
        for sk in score_keys:
            if row[sk] == MISSING_VALUE:
                row[sk] = "-"
            else:
                tasks_completed += 1
        row["tasks_completed"] = tasks_completed
        ldb_rows.append(row)
    df = pd.DataFrame(ldb_rows)
    df = df.query('date >= "2025-06-11"')
    # df = df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})

    # Sort by number of completed tasks, then by average score
    df = df.sort_values(by=["tasks_completed", "score"], ascending=False)

    # Format numeric columns
    numeric_cols = [c for c in df.columns if c.startswith("score_")]
    for nc in numeric_cols:
        df[nc] = df[nc].apply(lambda x: np.round(x, 2) if isinstance(x, float) else x)

    # Remove helper columns and rename for display
    df.drop(["tasks_completed"], axis=1, inplace=True)
    col_mapper = {f"score_{tname}": f"{tdisplay} (%)" for tname, tdisplay in TASKS if tname != "mvp"}
    col_mapper.update(
        {
            "model": "Model Name",
            "model_type": "Model Type",
            "vision_backbone": "Vision Backbone",
            "llm_backbone": "LLM Backbone",
            # "score": "Average Score (%)",
            "date": "Submission Date",
        }
    )
    df.rename(col_mapper, axis=1, inplace=True)
    df[f"{MVP_NAME} (%)"] = df.score_mvp_mini.astype(str)
    df.drop([f"score_{MVP_KEY}", f"score_{MVP_MINI_KEY}"], axis=1, inplace=True)

    # Order columns
    df = df[PRE_COL_NAMES + [f"{t[1]} (%)" for t in VISIBLE_TASKS] + POST_COL_NAMES]
    return df
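# Build small in-memory datasets mirroring the schema of the hosted results and contact
# datasets, used when DUMMY_DATA is enabled (local development without Hub access).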
def create_dummy_data():
    # Dummy evals data
    rows = [
        {
            "url": "https://deepmind.google/models/gemini/flash/",
            "model": "Gemini Test",
            "model_type": "Closed",
            "system_prompt": "test",
            "vision_backbone": " - ",
            "llm_backbone": " - ",
            "num_frames": 10,
            f"score_{INTP_KEY}": 56.1,
            f"score_{MVP_KEY}": MISSING_VALUE,
            f"score_{MVP_MINI_KEY}": MISSING_VALUE,
            f"score_{WMQA_KEY}": 61.66,
            "date": datetime.datetime.today().strftime("%Y-%m-%d"),
            "organization": "test",
            "submitted_by": "octocat",
        },
        {
            "url": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf",
            "model": "Llava 1.6",
            "model_type": "Open",
            "system_prompt": "test",
            "vision_backbone": "CLIP",
            "llm_backbone": "Mistral",
            "num_frames": 16,
            f"score_{INTP_KEY}": MISSING_VALUE,
            f"score_{MVP_KEY}": MISSING_VALUE,
            f"score_{MVP_MINI_KEY}": MISSING_VALUE,
            f"score_{WMQA_KEY}": MISSING_VALUE,
            "date": datetime.datetime.today().strftime("%Y-%m-%d"),
            "organization": "test",
            "submitted_by": "octocat",
        },
        {
            "url": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf",
            "model": "Llava 1.6",
            "model_type": "Open",
            "system_prompt": "test",
            "vision_backbone": "CLIP",
            "llm_backbone": "Mistral",
            "num_frames": 16,
            f"score_{INTP_KEY}": 0.0,
            f"score_{MVP_KEY}": MISSING_VALUE,
            f"score_{MVP_MINI_KEY}": MISSING_VALUE,
            f"score_{WMQA_KEY}": 0.0,
            "date": datetime.datetime.today().strftime("%Y-%m-%d"),
            "organization": "test",
            "submitted_by": "octocat",
        },
    ]
    dt = DatasetDict({"valid": Dataset.from_list(rows), "test": Dataset.from_list(rows)})

    # Dummy contact info
    contact_info = {
        "model": "llama",
        "url": "test",
        "organization": "test",
        "username": "test",
        "mail": "test",
        "date": datetime.datetime.today().strftime("%Y-%m-%d"),
    }
    cdt = DatasetDict({"valid": Dataset.from_list([contact_info]), "test": Dataset.from_list([contact_info])})
    return dt, cdt


DUMMY_DATA = False
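# Load the published results dataset (or the dummy data above) and build the leaderboard
# DataFrames for the "valid" and "test" splits.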
def get_eval_data():
    if DUMMY_DATA:
        eval_results, _ = create_dummy_data()
    else:
        eval_results = load_dataset(
            RESULTS_DATASET,
            token=TOKEN,
            download_mode="force_redownload",
            verification_mode=VerificationMode.NO_CHECKS,
            trust_remote_code=True,
        )
    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="valid")
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
    return eval_results, eval_dataframe_val, eval_dataframe_test


def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


# --- MVP functions ---
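# Check that an MVP submission only uses the "mvp"/"mvp_mini" subsets, covers every task
# config of the gold dataset, has the expected number of rows per task, and that every
# row_id matches a video_id in the gold split.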
def validate_mvp(submission_df, split="valid"):
    subsets = submission_df.data_name.unique()
    for subset in subsets:
        assert subset in [MVP_KEY, MVP_MINI_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting either mvp or mvp_mini"
        )
    gold_tasks = get_dataset_config_names(MVP_DATASET, token=TOKEN)
    for subset in subsets:
        tasks = submission_df[submission_df.data_name == subset].task.unique()
        assert len(tasks) == len(gold_tasks), format_error(
            f"{MVP_NAME} submission must have all tasks, found = {tasks}, expecting = {gold_tasks}"
        )
        for task in tasks:
            sub_df = submission_df[(submission_df.data_name == subset) & (submission_df.task == task)].copy()
            assert task in gold_tasks, format_error(f"Found unknown task {task} for {MVP_NAME}, check submission")
            gold_dataset = load_dataset(MVP_DATASET, task, split="full" if subset == MVP_KEY else "mini", token=TOKEN)
            assert len(sub_df) == len(gold_dataset), format_error(
                f"Number of examples do not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} for task {task} in split {subset}"
            )
            id2answer = {row["video_id"]: row["answer"] for row in gold_dataset}
            for i, r in sub_df.iterrows():
                assert r["row_id"] in id2answer, format_error(
                    f"Submission contains row_id {r['row_id']} which doesn't match the dataset's video_id"
                )
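# Score every MVP row individually: a row gets rating 1 when the submitted model_answer
# exactly matches the gold answer for that video_id, else 0.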
def compute_scores_mvp(submission_df, split="valid"):
    gold_tasks = get_dataset_config_names(MVP_DATASET, token=TOKEN)
    subsets = submission_df.data_name.unique()
    scored_subs = []
    for subset in subsets:
        tasks = submission_df[submission_df.data_name == subset].task.unique()
        assert len(tasks) == len(gold_tasks), format_error(f"{MVP_NAME} submission must have all tasks")
        for task in tasks:
            sub_df = submission_df[(submission_df.data_name == subset) & (submission_df.task == task)].copy()
            gold_dataset = load_dataset(MVP_DATASET, task, split="full" if subset == MVP_KEY else "mini", token=TOKEN)
            id2answer = {row["video_id"]: row["answer"] for row in gold_dataset}
            correct = []
            for i, r in sub_df.iterrows():
                gold_answer = id2answer[r["row_id"]]
                model_answer = r["model_answer"]
                if gold_answer == model_answer:
                    correct.append(1)
                else:
                    correct.append(0)
            sub_df["rating"] = correct
            scored_subs.append(sub_df)
    return pd.concat(scored_subs)
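# MVP is reported as paired accuracy: the two rows that share a video prefix (row_id
# minus its last "_" suffix) must both be rated correct for the pair to count.
# Per-task pair accuracies are then macro-averaged per subset.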
def aggregate_scores_mvp(scored_submission_df, split="valid"):
    subsets = scored_submission_df.data_name.unique()
    subset_scores = {f"score_{s}": 0 for s in subsets}
    for subset in subsets:
        tasks = scored_submission_df[scored_submission_df.data_name == subset].task.unique()
        task_pair_accuracies = []
        for task in tasks:
            sub_df = scored_submission_df[
                (scored_submission_df.data_name == subset) & (scored_submission_df.task == task)
            ].copy()
            result_by_vid = {}
            pair_correct_count = 0
            for i, row in sub_df.iterrows():
                video_id = "_".join(row["row_id"].split("_")[:-1])
                if video_id not in result_by_vid:
                    result_by_vid[video_id] = [row.to_dict()]
                else:
                    result_by_vid[video_id].append(row.to_dict())
            for video_id, answer_dict_pair in result_by_vid.items():
                answer_dict_1, answer_dict_2 = answer_dict_pair
                if answer_dict_1["rating"] == 1 and answer_dict_2["rating"] == 1:
                    pair_correct_count += 1
            task_pair_accuracies.append((pair_correct_count / len(result_by_vid)) * 100)
        # Compute macro scores
        subset_scores[f"score_{subset}"] = np.mean(task_pair_accuracies)
    return subset_scores


# --- CausalVQA functions ---
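# Check that a CausalVQA submission only uses the "causalvqa" subset, has the same number
# of rows as the gold set, and that every row_id matches a gold "<id>_<n>" key.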
def validate_causalvqa(submission_df, split="test"):
    # assert split == "test", format_error(f"Split {split} not available for dataset {WMQA_NAME}")
    split = "train"
    subsets = submission_df.data_name.unique()
    for subset in subsets:
        assert subset in [WMQA_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting causalvqa"
        )
    gold_tasks = get_dataset_config_names(WMQA_DATASET, token=TOKEN)
    for subset in subsets:
        tasks = "default"  # submission_df[submission_df.data_name == subset].task.unique()
        sub_df = submission_df[(submission_df.data_name == subset)].copy()
        # note, causalvqa only has a test dataset under hf split 'valid'
        gold_dataset = load_dataset(WMQA_DATASET, "", split="train", token=TOKEN)
        assert len(sub_df) == len(gold_dataset), format_error(
            f"Number of examples do not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} in split {subset}"
        )
        id2answer = {row["id"] + "_" + str(row["n"]): row["answer"] for row in gold_dataset}
        for i, r in sub_df.iterrows():
            assert r["row_id"] in id2answer, format_error(
                f"Submission contains row_id {r['row_id']} which doesn't match the dataset's qid"
            )
    print("validated")
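# Normalize each submitted answer to a single multiple-choice letter (A-E), join the
# submission with the gold questions on row_id, and rate each question 1/0 for an exact
# match with the gold answer.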
def compute_scores_causalvqa(submission_df, split="test"):
    # assert split == "test", format_error(f"Split {split} not available for dataset {WMQA_NAME}")
    split = "train"
    gold_tasks = get_dataset_config_names(WMQA_DATASET, token=TOKEN)
    subsets = submission_df.data_name.unique()
    scored_subs = []
    for subset in subsets:
        sub_df = submission_df[(submission_df.data_name == subset)].copy()
        sub_df["model_answer"] = (
            sub_df["model_answer"].str.replace(r"[^a-eA-E]", "", regex=True, flags=re.IGNORECASE).str.upper()
        )
        gold_dataset = load_dataset(WMQA_DATASET, "", split="train", token=TOKEN)
        gold_dataset = gold_dataset.to_pandas()
        gold_dataset["row_id"] = gold_dataset.apply(lambda x: x["id"] + "_" + str(x["n"]), axis=1)
        joined = pd.merge(gold_dataset, sub_df, on="row_id", how="left")
        correct = []
        for i, r in joined.iterrows():
            gold_answer = r["answer"]
            model_answer = r["model_answer"]
            if gold_answer == model_answer:
                correct.append(1)
            else:
                correct.append(0)
        joined["rating"] = correct
        scored_subs.append(joined)
        print(joined.columns)
    print("scored")
    return pd.concat(scored_subs)
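# CausalVQA is scored at the question-group level: for each (id, strata) group the model
# earns a point only when the rating sum over that group equals 2, i.e. both questions in
# the pair are answered correctly. The subset score is the mean of these points, in percent.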
def aggregate_scores_causalvqa(scored_submission_df, split="test"):
    subsets = scored_submission_df.data_name.unique()
    subset_scores = {f"score_{s}": 0 for s in subsets}
    for subset in subsets:
        sub_df = scored_submission_df[scored_submission_df.data_name == subset].copy()
        agg_df = sub_df.groupby(["id", "strata"])["rating"].sum().reset_index()
        agg_df["points"] = 0
        agg_df.loc[agg_df["rating"] == 2, "points"] = 1
        # Compute macro scores
        subset_scores[f"score_{subset}"] = agg_df.points.mean() * 100.00
    print("aggregated")
    return subset_scores


# --- IntPhys functions ---
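# Check that an IntPhys 2 submission only uses the "intphys2" subset, has one row per gold
# example, and that every row_id matches a video "name" in the gold test split.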
def validate_intphys(submission_df, split="test"):
    assert split == "test", format_error(f"Split {split} not available for dataset {INTP_NAME}")
    subsets = submission_df.data_name.unique()
    for subset in subsets:
        assert subset in [INTP_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting " + INTP_KEY
        )
    gold_tasks = get_dataset_config_names(INTP_DATASET, token=TOKEN)
    for subset in subsets:
        sub_df = submission_df[(submission_df.data_name == subset)].copy()
        gold_dataset = load_dataset(INTP_DATASET, "", split="test")
        assert len(sub_df) == len(gold_dataset), format_error(
            f"Number of examples do not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} in split {subset}"
        )
        id2answer = {row["name"]: row["answer"] for row in gold_dataset}
        for i, r in sub_df.iterrows():
            assert r["row_id"] in id2answer, format_error(
                f"Submission contains row_id {r['row_id']} which doesn't match the dataset's video_id"
            )
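# Rate each IntPhys 2 row 1/0 for an exact match between the submitted model_answer and
# the gold answer keyed by video name.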
def compute_scores_intphys(submission_df, split="test"):
    assert split == "test", format_error(f"Split {split} not available for dataset {INTP_NAME}")
    gold_tasks = get_dataset_config_names(INTP_DATASET, token=TOKEN)
    subsets = submission_df.data_name.unique()
    scored_subs = []
    for subset in subsets:
        sub_df = submission_df[(submission_df.data_name == subset)].copy()
        gold_dataset = load_dataset(INTP_DATASET, "", split="test", token=TOKEN)
        id2answer = {row["name"]: row["answer"] for row in gold_dataset}
        correct = []
        for i, r in sub_df.iterrows():
            gold_answer = id2answer[r["row_id"]]
            model_answer = r["model_answer"]
            if gold_answer == model_answer:
                correct.append(1)
            else:
                correct.append(0)
        sub_df["rating"] = correct
        scored_subs.append(sub_df)
    return pd.concat(scored_subs)
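# IntPhys 2 is reported as plain accuracy: the percentage of rows rated correct per subset.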
def aggregate_scores_intphys(scored_submission_df, split="test"):
    subsets = scored_submission_df.data_name.unique()
    subset_scores = {f"score_{s}": 0 for s in subsets}
    accuracies = []
    for subset in subsets:
        sub_df = scored_submission_df[
            (scored_submission_df.data_name == subset)
        ].copy()
        result_by_vid = {}
        pair_correct_count = 0
        for i, row in sub_df.iterrows():
            if row["rating"] == 1:
                pair_correct_count += 1
        accuracies.append((pair_correct_count / len(sub_df)) * 100)
        # Compute macro scores
        subset_scores[f"score_{subset}"] = np.mean(accuracies)
    return subset_scores


VALIDATION_FN = {
    MVP_KEY: validate_mvp,
    MVP_MINI_KEY: validate_mvp,
    INTP_KEY: validate_intphys,
    WMQA_KEY: validate_causalvqa,
}
SCORER_FN = {
    MVP_KEY: compute_scores_mvp,
    MVP_MINI_KEY: compute_scores_mvp,
    INTP_KEY: compute_scores_intphys,
    WMQA_KEY: compute_scores_causalvqa,
}
AGGREGATE_FN = {
    MVP_KEY: aggregate_scores_mvp,
    MVP_MINI_KEY: aggregate_scores_mvp,
    INTP_KEY: aggregate_scores_intphys,
    WMQA_KEY: aggregate_scores_causalvqa,
}
def compute_scores(submission_df, split="valid"):
    """
    Run the scoring against the held-out valid/test sets and update the submission with
    metrics for each dataset:
    - first, validate the input to ensure the right keys are present
    - then, run the per-dataset evaluations
    """
    tasks = submission_df.data_name.unique()
    scored_subs = []
    for t in tasks:
        task_sub = submission_df[submission_df.data_name == t].copy()
        scored_subs.append(SCORER_FN[t](task_sub, split))
    scored_subs = pd.concat(scored_subs)
    return scored_subs


def aggregate_scores(scored_df, split="valid"):
    tasks = scored_df.data_name.unique()
    agg_scores = {}
    for task in tasks:
        task_sub = scored_df[scored_df.data_name == task].copy()
        agg_metrics = AGGREGATE_FN[task](task_sub, split=split)
        agg_scores.update(agg_metrics)
    return agg_scores


def validate_submission(submission_df, split="valid"):
    """
    Validate user submissions.
    """
    # Generic checks
    assert "data_name" in submission_df.columns, format_error("Submission missing column data_name")
    assert "row_id" in submission_df.columns, format_error("Submission missing column row_id")
    assert "task" in submission_df.columns, format_error("Submission missing column task")
    assert "model_answer" in submission_df.columns, format_error("Submission missing column model_answer")
    tasks = submission_df.data_name.unique()
    valid_tasks = [t[0] for t in TASKS] + [MVP_MINI_KEY]
    for t in tasks:
        assert t in valid_tasks, format_error(
            f"Submission contains one or more rows with data_name={t}, which is not a valid task for this leaderboard (expecting to match a dataset in {valid_tasks})"
        )
    # Dataset-specific checks
    for task in tasks:
        task_sub = submission_df[submission_df.data_name == task].copy()
        VALIDATION_FN[task](task_sub)
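# Illustrative example of the submission format accepted above (field values are
# placeholders, not taken from the real datasets): a .jsonl file with one record per
# answered question, e.g.
#   {"data_name": "intphys2", "task": "default", "row_id": "<gold example id>", "model_answer": "<answer>"}
# where row_id must match the gold identifier used by the corresponding benchmark
# (video_id for MVP, "<id>_<n>" for CausalVQA, the video "name" for IntPhys 2).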
def add_new_eval(
    model: str,
    vision_backbone: str,
    llm_backbone: str,
    url: str,
    model_type: str,
    path_to_file: str,
    organization: str,
    mail: str,
    profile: gr.OAuthProfile,
    progress=gr.Progress(),
):
    progress(0, desc="Validating user ...")
    contact_infos = load_dataset(
        CONTACT_DATASET,
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode=VerificationMode.NO_CHECKS,
        trust_remote_code=True,
    )
    user_submission_dates = sorted(
        row["date"] for row in contact_infos["test"] if row["username"] == profile.username
    )
    # Limit submissions to one per user per day
    if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime("%Y-%m-%d"):
        return format_error("You already submitted once today, please try again tomorrow.")

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")
    progress(0.1, desc="Fetching recent evals ...")
    eval_results, _, _ = get_eval_data()

    # # Check if the model/organization combination already exists and print a warning message if so
    # if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organization.lower() in set(
    #     [o.lower() for o in eval_results[val_or_test]["organization"]]
    # ):
    #     return format_warning("This model has been already submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Validate the submission; do not save it until it is fully validated
    progress(0.3, desc="Validating user submission ...")
    file_path = path_to_file.name
    assert file_path.endswith(".jsonl"), format_error("Please submit a jsonl file")
    submissions_df = pd.read_json(file_path, lines=True, orient="records")
    validate_submission(submissions_df)

    # Save the submitted file
    if LOCAL_DEBUG:
        gr.Info("In local debug mode, mock uploading submission dataset.")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=path_to_file.name,
            path_in_repo=f"{organization}/{model}/submissions/test_raw_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN,
        )

    # Compute scores
    progress(0.5, desc="Computing scores ...")
    scored_df = compute_scores(submissions_df, split="test")

    # Save the scored files
    if LOCAL_DEBUG:
        gr.Info("In local debug mode, mock uploading scored files")
    else:
        tasks = scored_df.data_name.unique()
        for task in tasks:
            scored_df.to_json(f"scored/{organization}_{model}_{task}.jsonl", lines=True, orient="records")
            api.upload_file(
                repo_id=SUBMISSION_DATASET,
                path_or_fileobj=f"scored/{organization}_{model}_{task}.jsonl",
                path_in_repo=f"{organization}/{model}/scored/{task}/test_scored_{datetime.datetime.today()}.jsonl",
                repo_type="dataset",
                token=TOKEN,
            )

    # Actual submission
    progress(0.7, desc="Submitting leaderboard entry ...")
    eval_entry = {
        "model": model,
        "model_type": model_type,
        "vision_backbone": vision_backbone,
        "llm_backbone": llm_backbone,
        "url": url,
        "organization": organization,
        "submitted_by": profile.username,
        "date": datetime.datetime.today().strftime("%Y-%m-%d"),
    }
    agg_metrics = aggregate_scores(scored_df, split="test")
    eval_entry.update(agg_metrics)
    # Set missing tasks to MISSING_VALUE
    task_keys = [t[0] for t in TASKS] + [MVP_MINI_KEY]
    missing_metrics = {f"score_{task}": MISSING_VALUE for task in task_keys if f"score_{task}" not in eval_entry}
    eval_entry.update(missing_metrics)
    eval_results["test"] = eval_results["test"].add_item(eval_entry)
    if LOCAL_DEBUG:
        print(eval_results["test"][-1])
        gr.Info("In local debug mode, mock uploading aggregated scores")
    else:
        eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)

    progress(0.9, desc="Updating contacts ...")
    contact_info = {
        "model": model,
        "url": url,
        "organization": organization,
        "username": profile.username,
        "mail": mail,
        "date": datetime.datetime.today().strftime("%Y-%m-%d"),
    }
    contact_infos["test"] = contact_infos["test"].add_item(contact_info)
    if LOCAL_DEBUG:
        print("mock uploaded contact info")
    else:
        contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)

    progress(1.0, desc="Completed evaluation successfully. Please refresh leaderboard")
    success_str = f"Model {model} submitted by {organization} is successfully evaluated and stored in our database.\nPlease wait a few hours and refresh the leaderboard to see your score displayed."
    format_log(success_str)
    return success_str
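# Refresh callback for the leaderboard tab: re-fetch the published results and rebuild the
# test-split Dataframe component with the currently visible task columns.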
def on_filter_model_size_method_change():
    _, eval_dataframe_val, eval_dataframe_test = get_eval_data()
    # eval_dataframe_val = eval_dataframe_val[PRE_COL_NAMES + [f"{t} (%)" for t in selected_columns] + POST_COL_NAMES]
    eval_dataframe_test = eval_dataframe_test[PRE_COL_NAMES + [f"{t} (%)" for _, t in VISIBLE_TASKS] + POST_COL_NAMES]
    datatypes = ["markdown"] + ["number" for _ in VISIBLE_TASKS] + ["text"] + ["text"] + ["text"] + ["date"]
    # val_ldb = gr.components.Dataframe(
    #     value=eval_dataframe_val, datatype=datatypes, interactive=False, column_widths=["20%"]
    # )
    test_ldb = gr.components.Dataframe(
        value=eval_dataframe_test, datatype=datatypes, interactive=False, column_widths=["20%"]
    )
    return test_ldb


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths
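# Gradio app: a results tab with a refreshable leaderboard table and an accordion holding
# the submission form (submission requires a Hugging Face login). A background scheduler
# restarts the Space every hour so the published results are re-fetched.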
if __name__ == "__main__":
    _, eval_dataframe_val, eval_dataframe_test = get_eval_data()

    demo = gr.Blocks()
    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
                gr.Markdown(CITATION_BUTTON_LABEL)
                gr.Markdown(CITATION_BUTTON_TEXT)

        datatypes = ["markdown"] + ["number" for _ in VISIBLE_TASKS] + ["text"] + ["text"] + ["text"] + ["date"]
        with gr.Tab("Results: Test"):
            leaderboard_table_test = gr.components.Dataframe(
                value=eval_dataframe_test, datatype=datatypes, interactive=False, column_widths=["20%"]
            )

        refresh_button = gr.Button("Refresh")
        refresh_button.click(
            # print(task_filter)
            on_filter_model_size_method_change,
            # inputs=[VISIBLE_TASKS],
            # inputs=[],
            outputs=[
                # leaderboard_table_val,
                leaderboard_table_test,
            ],
        )

        with gr.Accordion("Submit a new model for evaluation"):
            with gr.Row():
                gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    # level_of_test = "test"
                    model_name_textbox = gr.Textbox(label="Model name")
                    model_url = gr.Textbox(label="Model URL")
                    model_type = gr.Dropdown(choices=["Open", "Closed"], label="Model Type")
                    # num_frames = gr.Textbox(label="Number of frames used")
                    llm_backbone_textbox = gr.Textbox(label="LLM Backbone")
                    vision_backbone_textbox = gr.Textbox(label="Vision Backbone")
                    # system_prompt_textbox = gr.Textbox(label="System prompt example")
                    # url_textbox = gr.Textbox(label="Url to model information")
                with gr.Column():
                    organization = gr.Textbox(label="Organization")
                    mail = gr.Textbox(label="Contact email")
                    file_output = gr.File()
            submission_result = gr.Textbox(label="Status")
            with gr.Row():
                with gr.Column():
                    gr.LoginButton()
                with gr.Column():
                    submit_button = gr.Button("Submit Eval")
            submit_button.click(
                add_new_eval,
                [
                    # level_of_test,
                    model_name_textbox,
                    vision_backbone_textbox,
                    llm_backbone_textbox,
                    model_url,
                    model_type,
                    # num_frames,
                    file_output,
                    organization,
                    mail,
                ],
                submission_result,
            )

    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=3600)
    scheduler.start()
    demo.launch(debug=True)