import os

# This .py file stores the constants used by the leaderboard app.
MODEL_INFO = [
    "Model Name (clickable)",
    "Sampled by",
    "Evaluated by",
    "Accessibility",
    "Date",
    "Total Score",
    "Quality Score",
    "Semantic Score",
    "Selected Score",
]
MODEL_INFO_TAB_QUALITY = [
    "Model Name (clickable)",
    "Quality Score",
    "Selected Score",
]
MODEL_INFO_TAB_I2V = [
    "Model Name (clickable)",
    "Sampled by",
    "Evaluated by",
    "Accessibility",
    "Date",
    "Total Score",
    "I2V Score",
    "Quality Score",
    "Selected Score",
]
TASK_INFO = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]
# All dimensions are selected by default.
DEFAULT_INFO = TASK_INFO.copy()
QUALITY_LIST = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]
SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]
QUALITY_TAB = [
    "subject consistency",
    "background consistency",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]
I2V_LIST = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
]
I2V_QUALITY_LIST = [
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    # "Temporal Flickering"
]
I2V_TAB = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    # "Temporal Flickering"
]
DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1,
}
DIM_WEIGHT_I2V = {
    "Video-Text Camera Motion": 0.1,
    "Video-Image Subject Consistency": 1,
    "Video-Image Background Consistency": 1,
    "Subject Consistency": 1,
    "Background Consistency": 1,
    "Motion Smoothness": 1,
    "Dynamic Degree": 0.5,
    "Aesthetic Quality": 1,
    "Imaging Quality": 1,
    "Temporal Flickering": 1,
}
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0
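
# A minimal sketch of how the weights above are presumably combined (assumed
# usage, not the app's actual scoring code): scale each dimension score by
# DIM_WEIGHT, average within the quality and semantic groups, then blend the
# two group means with QUALITY_WEIGHT and SEMANTIC_WEIGHT. The helper name
# and the exact formula are assumptions.
def example_total_score(scores):
    """Blend per-dimension scores into one total score (illustrative only)."""
    quality = (sum(scores[d] * DIM_WEIGHT[d] for d in QUALITY_LIST)
               / sum(DIM_WEIGHT[d] for d in QUALITY_LIST))
    semantic = (sum(scores[d] * DIM_WEIGHT[d] for d in SEMANTIC_LIST)
                / sum(DIM_WEIGHT[d] for d in SEMANTIC_LIST))
    return ((QUALITY_WEIGHT * quality + SEMANTIC_WEIGHT * semantic)
            / (QUALITY_WEIGHT + SEMANTIC_WEIGHT))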
# Per-column dtypes for the Gradio tables: leading text columns render as
# markdown, score columns as numbers (25 entries, matching COLUMN_NAMES).
DATA_TITILE_TYPE = ["markdown"] * 5 + ["number"] * 20
# Dtypes for the I2V table: one markdown column, then numeric columns.
I2V_TITILE_TYPE = ["markdown"] + ["number"] * 20
SUBMISSION_NAME = "vstar_leaderboard_submission"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/V-STaR-Bench", SUBMISSION_NAME)
CSV_DIR = "./vstar_leaderboard_submission/results.csv"
QUALITY_DIR = "./vstar_leaderboard_submission/quality.csv"
I2V_DIR = "./vstar_leaderboard_submission/i2v_results.csv"
LONG_DIR = "./vstar_leaderboard_submission/long_debug.csv"
INFO_DIR = "./vstar_leaderboard_submission/model_info.csv"
COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES_QUALITY = MODEL_INFO_TAB_QUALITY + QUALITY_TAB
COLUMN_NAMES_I2V = MODEL_INFO_TAB_I2V + I2V_TAB
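
# A minimal sketch of how the CSV paths and column constants above are
# presumably consumed (assumed usage; the real app may differ). pandas is
# assumed to be available, as is typical for Gradio leaderboard Spaces.
def example_load_leaderboard(csv_path=CSV_DIR):
    """Load a results CSV and keep only the display columns (illustrative)."""
    import pandas as pd
    df = pd.read_csv(csv_path)
    return df[[c for c in COLUMN_NAMES if c in df.columns]]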
LEADERBORAD_INTRODUCTION = """# V-STaR Leaderboard

*"Can Video-LLMs reason through a sequential spatio-temporal logic in videos?"*

🏆 Welcome to the leaderboard of **V-STaR**! 🎦 *A spatio-temporal reasoning benchmark for Video-LLMs* [GitHub](https://github.com/V-STaR-Bench/V-STaR)

<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
    <a href='https://arxiv.org/abs/2503.11495'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
    <a href='https://v-star-bench.github.io/'><img src='https://img.shields.io/badge/V--STaR-Website-green?logo=googlechrome&logoColor=green'></a>
</div>

- **Comprehensive Dimensions:** We evaluate Video-LLMs' spatio-temporal reasoning ability to answer questions explicitly in the context of "when", "where", and "what".
- **Human Alignment:** We conducted extensive experiments and human annotations to validate the robustness of V-STaR.
- **New Metrics:** We propose the Arithmetic Mean (AM) and a modified logarithmic Geometric Mean (LGM) to measure the spatio-temporal reasoning capability of Video-LLMs. AM and LGM are computed from the "Accuracy" of VQA, the "m_tIoU" of temporal grounding, and the "m_vIoU" of spatial grounding, and we report the mean AM (mAM) and mean LGM (mLGM) over our two proposed RSTR question chains.
- **Valuable Insights:** V-STaR reveals a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning.

**Join the leaderboard**: please contact us to add or update your results.

**Credits**: This leaderboard is updated and maintained by the team of [V-STaR Contributors]().
"""
SUBMIT_INTRODUCTION = """# Submitting to the V-STaR Benchmark

⚠️ Please note that you need to obtain the `results/*.json` files by running V-STaR from its GitHub repository. You may run an [offline evaluation](https://github.com/V-STaR-Bench/V-STaR) before submitting.

⚠️ Then, please contact us to add your results via [email1](mailto:[email protected]) or [email2](mailto:[email protected]).
"""
TABLE_INTRODUCTION = """
"""
LEADERBORAD_INFO = """
V-STaR is a comprehensive spatio-temporal reasoning benchmark for video large language models (Video-LLMs). We construct a fine-grained reasoning dataset with coarse-to-fine CoT questions, enabling a structured evaluation of spatio-temporal reasoning. Specifically, we introduce a Reverse Spatio-Temporal Reasoning (RSTR) task to quantify models' spatio-temporal reasoning ability. Experiments on V-STaR reveal that although many models perform well on "what", some struggle to ground their answers in time and location. This finding highlights a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning and inspires research into improving trustworthy spatio-temporal understanding in future Video-LLMs.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{cheng2025vstarbenchmarkingvideollmsvideo,
      title={V-STaR: Benchmarking Video-LLMs on Video Spatio-Temporal Reasoning},
      author={Zixu Cheng and Jian Hu and Ziquan Liu and Chenyang Si and Wei Li and Shaogang Gong},
      year={2025},
      eprint={2503.11495},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2503.11495},
}"""
QUALITY_CLAIM_TEXT = "We use all the videos on the Sora website (https://openai.com/sora) for a preliminary evaluation, including the failure-case videos Sora provided."
I2V_CLAIM_TEXT = "Since the open-sourced SVD models do not accept text input during the I2V stage, we are unable to evaluate their `camera motion` in terms of `video-text consistency`. The total score is calculated based on all dimensions except `camera motion`."
LONG_CLAIM_TEXT = ""
NORMALIZE_DIC = {
    "subject consistency": {"Min": 0.1462, "Max": 1.0},
    "background consistency": {"Min": 0.2615, "Max": 1.0},
    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
    "motion smoothness": {"Min": 0.7060, "Max": 0.9975},
    "dynamic degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
    "imaging quality": {"Min": 0.0, "Max": 1.0},
    "object class": {"Min": 0.0, "Max": 1.0},
    "multiple objects": {"Min": 0.0, "Max": 1.0},
    "human action": {"Min": 0.0, "Max": 1.0},
    "color": {"Min": 0.0, "Max": 1.0},
    "spatial relationship": {"Min": 0.0, "Max": 1.0},
    "scene": {"Min": 0.0, "Max": 0.8222},
    "appearance style": {"Min": 0.0009, "Max": 0.2855},
    "temporal style": {"Min": 0.0, "Max": 0.364},
    "overall consistency": {"Min": 0.0, "Max": 0.364},
}
NORMALIZE_DIC_I2V = {
    "Video-Text Camera Motion": {"Min": 0.0, "Max": 1.0},
    "Video-Image Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Video-Image Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Motion Smoothness": {"Min": 0.7060, "Max": 0.9975},
    "Dynamic Degree": {"Min": 0.0, "Max": 1.0},
    "Aesthetic Quality": {"Min": 0.0, "Max": 1.0},
    "Imaging Quality": {"Min": 0.0, "Max": 1.0},
    "Temporal Flickering": {"Min": 0.6293, "Max": 1.0},
}
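
# A minimal sketch of how the Min/Max tables above are presumably applied
# (assumed usage): min-max rescale a raw dimension score to [0, 1] before it
# is weighted. The helper name is an assumption.
def example_normalize(dim, raw, table=NORMALIZE_DIC):
    """Min-max normalize a raw score for one dimension (illustrative only)."""
    lo, hi = table[dim]["Min"], table[dim]["Max"]
    return (raw - lo) / (hi - lo)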