from dataclasses import dataclass from enum import Enum @dataclass class Task: benchmark: str metric: str col_name: str # Select your tasks here # --------------------------------------------------- #TODO 指标 class Tasks(Enum): # task_key in the json file, metric_key in the json file, name to display in the leaderboard task0 = Task("Score_avg", "score", "Score_Avg ⬆️") task1 = Task("Score_gpt", "score", "Score_GPT") task2 = Task("Score_cog", "score", "Score_COG") task3 = Task("Score_cpm", "score", "Score_CPM") task4 = Task("Length_Avg", "scoreL", "Length_Avg") NUM_FEWSHOT = 0 # Change with your few shot # --------------------------------------------------- #TODO title TITLE = """
CapArena-Auto is an arena-style automated evaluation benchmark for detailed captioning. It includes 600 evaluation images and assesses model performance through pairwise battles with three baseline models. The final score is calculated by GPT4o-as-a-Judge.