# Constants for the V-STaR leaderboard Space.
MODEL_INFO = [
"Model Name (clickable)",
"Sampled by",
"Evaluated by",
"Accessibility",
"Date",
"Total Score",
"Quality Score",
"Semantic Score",
"Selected Score",
]
MODEL_INFO_TAB_QUALITY = [
"Model Name (clickable)",
"Quality Score",
"Selected Score"
]
MODEL_INFO_TAB_I2V = [
"Model Name (clickable)",
"Sampled by",
"Evaluated by",
"Accessibility",
"Date",
"Total Score",
"I2V Score",
"Quality Score",
"Selected Score"
]
TASK_INFO = [
"subject consistency",
"background consistency",
"temporal flickering",
"motion smoothness",
"dynamic degree",
"aesthetic quality",
"imaging quality",
"object class",
"multiple objects",
"human action",
"color",
"spatial relationship",
"scene",
"appearance style",
"temporal style",
"overall consistency"
]
# DEFAULT_INFO mirrors TASK_INFO; copy it so mutating one list never affects the other.
DEFAULT_INFO = list(TASK_INFO)
QUALITY_LIST = [
"subject consistency",
"background consistency",
"temporal flickering",
"motion smoothness",
"aesthetic quality",
"imaging quality",
"dynamic degree",]
SEMANTIC_LIST = [
"object class",
"multiple objects",
"human action",
"color",
"spatial relationship",
"scene",
"appearance style",
"temporal style",
"overall consistency"
]
QUALITY_TAB = [
"subject consistency",
"background consistency",
"motion smoothness",
"aesthetic quality",
"imaging quality",
"dynamic degree",]
I2V_LIST = [
"Video-Text Camera Motion",
"Video-Image Subject Consistency",
"Video-Image Background Consistency",
]
I2V_QUALITY_LIST = [
"Subject Consistency",
"Background Consistency",
"Motion Smoothness",
"Dynamic Degree",
"Aesthetic Quality",
"Imaging Quality",
# "Temporal Flickering"
]
I2V_TAB = [
"Video-Text Camera Motion",
"Video-Image Subject Consistency",
"Video-Image Background Consistency",
"Subject Consistency",
"Background Consistency",
"Motion Smoothness",
"Dynamic Degree",
"Aesthetic Quality",
"Imaging Quality",
# "Temporal Flickering"
]
DIM_WEIGHT = {
"subject consistency": 1,
"background consistency": 1,
"temporal flickering": 1,
"motion smoothness": 1,
"aesthetic quality": 1,
"imaging quality": 1,
"dynamic degree": 0.5,
"object class": 1,
"multiple objects": 1,
"human action": 1,
"color": 1,
"spatial relationship": 1,
"scene": 1,
"appearance style": 1,
"temporal style": 1,
"overall consistency": 1
}
DIM_WEIGHT_I2V = {
"Video-Text Camera Motion": 0.1,
"Video-Image Subject Consistency": 1,
"Video-Image Background Consistency": 1,
"Subject Consistency": 1,
"Background Consistency": 1,
"Motion Smoothness": 1,
"Dynamic Degree": 0.5,
"Aesthetic Quality": 1,
"Imaging Quality": 1,
"Temporal Flickering": 1
}
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0
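
# A minimal sketch (an assumption, not the Space's actual logic, which lives in
# app.py) of how the group weights above would mix a quality score and a
# semantic score into the leaderboard's "Total Score".
def mix_total_score(quality_score: float, semantic_score: float) -> float:
    """Weighted mean of the two score groups, here 4:1 quality vs. semantic."""
    return (QUALITY_WEIGHT * quality_score + SEMANTIC_WEIGHT * semantic_score) / (
        QUALITY_WEIGHT + SEMANTIC_WEIGHT
    )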
# Per-column datatypes for the leaderboard tables (e.g. for a Gradio Dataframe).
# Note: the "TITILE" spelling is kept because other modules import these names.
DATA_TITILE_TYPE = ['markdown'] * 5 + ['number'] * 20
I2V_TITILE_TYPE = ['markdown'] + ['number'] * 20
SUBMISSION_NAME = "vstar_leaderboard_submission"
# Plain string concatenation: os.path.join would emit backslashes on Windows.
SUBMISSION_URL = "https://huggingface.co/datasets/V-STaR-Bench/" + SUBMISSION_NAME
CSV_DIR = "./vstar_leaderboard_submission/results.csv"
QUALITY_DIR = "./vstar_leaderboard_submission/quality.csv"
I2V_DIR = "./vstar_leaderboard_submission/i2v_results.csv"
LONG_DIR = "./vstar_leaderboard_submission/long_debug.csv"
INFO_DIR = "./vstar_leaderboard_submission/model_info.csv"
COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES_QUALITY = MODEL_INFO_TAB_QUALITY + QUALITY_TAB
COLUMN_NAMES_I2V = MODEL_INFO_TAB_I2V + I2V_TAB
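
# Self-check (runs only when this module is executed directly): the datatype
# list must line up one-to-one with the assembled headers, or the table widget
# will render columns with the wrong type.
if __name__ == "__main__":
    assert len(DATA_TITILE_TYPE) == len(COLUMN_NAMES), (
        f"{len(DATA_TITILE_TYPE)} datatypes vs. {len(COLUMN_NAMES)} columns"
    )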
LEADERBORAD_INTRODUCTION = """# V-STaR Leaderboard
*"Can Video-LLMs reason through sequential spatio-temporal logic in videos?"*
🏆 Welcome to the leaderboard of **V-STaR**! 🎦 *A spatio-temporal reasoning benchmark for Video-LLMs* [![Code](https://img.shields.io/github/stars/V-STaR-Bench/V-STaR.svg?style=social&label=Official)](https://github.com/V-STaR-Bench/V-STaR)
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href='https://arxiv.org/abs/2503.11495'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
<a href='https://v-star-bench.github.io/'><img src='https://img.shields.io/badge/V--STaR-Website-green?logo=googlechrome&logoColor=green'></a>
</div>
- **Comprehensive Dimensions:** We evaluate a Video-LLM's spatio-temporal reasoning ability with questions explicitly framed around "when", "where", and "what".
- **Human Alignment:** We conducted extensive experiments and human annotations to validate the robustness of V-STaR.
- **New Metrics:** We propose the Arithmetic Mean (AM) and a modified logarithmic Geometric Mean (LGM) to measure the spatio-temporal reasoning capability of Video-LLMs. AM and LGM are computed from the "Accuracy" of VQA, the "m_tIoU" of temporal grounding, and the "m_vIoU" of spatial grounding; averaging over our two proposed RSTR question chains yields the mean AM (mAM) and mean LGM (mLGM).
- **Valuable Insights:** V-STaR reveals a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning.
**Join Leaderboard**: Please contact us to update your results.
**Credits**: This leaderboard is updated and maintained by the team of [V-STaR Contributors]().
"""
SUBMIT_INTRODUCTION = """# Submitting to the V-STaR Benchmark
## 🎈
⚠️ Please note that you need to obtain the `results/*.json` files by running V-STaR from GitHub. You may conduct an [offline evaluation](https://github.com/V-STaR-Bench/V-STaR) before submitting.
⚠️ Then, please contact us to update your results via [email1](mailto:[email protected]) or [email2](mailto:[email protected]).
"""
TABLE_INTRODUCTION = """
"""
LEADERBORAD_INFO = """
V-STaR is a comprehensive spatio-temporal reasoning benchmark for video large language models (Video-LLMs). We construct a fine-grained reasoning dataset with coarse-to-fine CoT questions, enabling a structured evaluation of spatio-temporal reasoning. Specifically, we introduce a Reverse Spatio-Temporal Reasoning (RSTR) task to quantify models' spatio-temporal reasoning ability. Experiments on V-STaR reveal that although many models perform well on "what", some struggle to ground their answers in time and location. This finding highlights a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning and motivates research on improving trustworthy spatio-temporal understanding in future Video-LLMs.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{cheng2025vstarbenchmarkingvideollmsvideo,
title={V-STaR: Benchmarking Video-LLMs on Video Spatio-Temporal Reasoning},
author={Zixu Cheng and Jian Hu and Ziquan Liu and Chenyang Si and Wei Li and Shaogang Gong},
year={2025},
eprint={2503.11495},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2503.11495},
}"""
QUALITY_CLAIM_TEXT = "We use all the videos on the Sora website (https://openai.com/sora) for a preliminary evaluation, including the failure-case videos Sora provided."
I2V_CLAIM_TEXT = "Since the open-sourced SVD models do not accept text input during the I2V stage, we are unable to evaluate their `camera motion` in terms of `video-text consistency`. The total score is calculated based on all dimensions except `camera motion`."
LONG_CLAIM_TEXT = ""
NORMALIZE_DIC = {
"subject consistency": {"Min": 0.1462, "Max": 1.0},
"background consistency": {"Min": 0.2615, "Max": 1.0},
"temporal flickering": {"Min": 0.6293, "Max": 1.0},
"motion smoothness": {"Min": 0.706, "Max": 0.9975},
"dynamic degree": {"Min": 0.0, "Max": 1.0},
"aesthetic quality": {"Min": 0.0, "Max": 1.0},
"imaging quality": {"Min": 0.0, "Max": 1.0},
"object class": {"Min": 0.0, "Max": 1.0},
"multiple objects": {"Min": 0.0, "Max": 1.0},
"human action": {"Min": 0.0, "Max": 1.0},
"color": {"Min": 0.0, "Max": 1.0},
"spatial relationship": {"Min": 0.0, "Max": 1.0},
"scene": {"Min": 0.0, "Max": 0.8222},
"appearance style": {"Min": 0.0009, "Max": 0.2855},
"temporal style": {"Min": 0.0, "Max": 0.364},
"overall consistency": {"Min": 0.0, "Max": 0.364}
}
NORMALIZE_DIC_I2V = {
"Video-Text Camera Motion": {"Min": 0.0, "Max": 1.0},
"Video-Image Subject Consistency": {"Min": 0.1462, "Max": 1.0},
"Video-Image Background Consistency": {"Min": 0.2615, "Max": 1.0},
"Subject Consistency": {"Min": 0.1462, "Max": 1.0},
"Background Consistency": {"Min": 0.2615, "Max": 1.0},
"Motion Smoothness": {"Min": 0.7060, "Max": 0.9975},
"Dynamic Degree": {"Min": 0.0, "Max": 1.0},
"Aesthetic Quality": {"Min": 0.0, "Max": 1.0},
"Imaging Quality": {"Min": 0.0, "Max": 1.0},
"Temporal Flickering": {"Min": 0.6293, "Max": 1.0}
}
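
# A minimal sketch (an assumption; the Space's real aggregation lives in
# app.py) of how the min/max ranges above would be applied: min-max normalize
# each raw dimension score, then take a DIM_WEIGHT-weighted mean over a group
# of dimensions such as QUALITY_LIST or SEMANTIC_LIST.
def normalize(dim: str, raw: float) -> float:
    """Map a raw dimension score into [0, 1] using the ranges in NORMALIZE_DIC."""
    lo, hi = NORMALIZE_DIC[dim]["Min"], NORMALIZE_DIC[dim]["Max"]
    return (raw - lo) / (hi - lo)

def group_score(raw_scores: dict, dims: list) -> float:
    """DIM_WEIGHT-weighted mean of normalized scores over the given dimensions."""
    total_weight = sum(DIM_WEIGHT[d] for d in dims)
    return sum(DIM_WEIGHT[d] * normalize(d, raw_scores[d]) for d in dims) / total_weight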