from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
# TODO: metrics
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("Score_avg", "score", "Score_Avg ⬆️")
    task1 = Task("Score_gpt", "score", "Score_GPT")
    task2 = Task("Score_cog", "score", "Score_COG")
    task3 = Task("Score_cpm", "score", "Score_CPM")
    task4 = Task("Length_Avg", "scoreL", "Length_Avg")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# TODO: title
TITLE = """

📈 CapArena-Auto Leaderboard 📊

"""


# Introduction text
def get_INTRODUCTION_TEXT(model_num: int, LAST_UPDATED: str, paper_link: str = "TODO"):
    return f"""
[📑 Paper]({paper_link}) | MODELS: {model_num} | UPDATED: {LAST_UPDATED}
"""


# TODO: benchmark introduction
INTRODUCE_BENCHMARK = """
💬 Metric Explanations

CapArena-Auto is an arena-style automated evaluation benchmark for detailed captioning. It contains 600 evaluation images and assesses model performance through pairwise battles against three baseline models; the final score is computed by GPT-4o-as-a-Judge.

"""
# An illustrative sketch of the pairwise scoring described above appears at
# the end of this file.


# TODO: about
LLM_BENCHMARKS_TEXT = """

See details in CapArena

"""

EVALUATION_QUEUE_TEXT = """
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""