# Constants for the V-STaR leaderboard: table columns, dimension lists,
# score weights, data paths, and page text.

MODEL_INFO = [
    "Model Name (clickable)", "Sampled by", "Evaluated by", "Accessibility",
    "Date", "Total Score", "Quality Score", "Semantic Score", "Selected Score",
]

MODEL_INFO_TAB_QUALITY = ["Model Name (clickable)", "Quality Score", "Selected Score"]

MODEL_INFO_TAB_I2V = [
    "Model Name (clickable)", "Sampled by", "Evaluated by", "Accessibility",
    "Date", "Total Score", "I2V Score", "Quality Score", "Selected Score",
]

TASK_INFO = [
    "subject consistency", "background consistency", "temporal flickering",
    "motion smoothness", "dynamic degree", "aesthetic quality",
    "imaging quality", "object class", "multiple objects", "human action",
    "color", "spatial relationship", "scene", "appearance style",
    "temporal style", "overall consistency",
]

# Dimensions selected by default in the UI (currently all of TASK_INFO).
DEFAULT_INFO = list(TASK_INFO)

QUALITY_LIST = [
    "subject consistency", "background consistency", "temporal flickering",
    "motion smoothness", "aesthetic quality", "imaging quality",
    "dynamic degree",
]

SEMANTIC_LIST = [
    "object class", "multiple objects", "human action", "color",
    "spatial relationship", "scene", "appearance style", "temporal style",
    "overall consistency",
]

QUALITY_TAB = [
    "subject consistency", "background consistency", "motion smoothness",
    "aesthetic quality", "imaging quality", "dynamic degree",
]

I2V_LIST = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
]

I2V_QUALITY_LIST = [
    "Subject Consistency", "Background Consistency", "Motion Smoothness",
    "Dynamic Degree", "Aesthetic Quality", "Imaging Quality",
    # "Temporal Flickering"
]

I2V_TAB = I2V_LIST + I2V_QUALITY_LIST

# Per-dimension weights used when aggregating scores.
DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1,
}

DIM_WEIGHT_I2V = {
    "Video-Text Camera Motion": 0.1,
    "Video-Image Subject Consistency": 1,
    "Video-Image Background Consistency": 1,
    "Subject Consistency": 1,
    "Background Consistency": 1,
    "Motion Smoothness": 1,
    "Dynamic Degree": 0.5,
    "Aesthetic Quality": 1,
    "Imaging Quality": 1,
    "Temporal Flickering": 1,
}

# Relative weights of the semantic and quality groups in the total score.
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0

# Column types for the results tables: text columns render as 'markdown',
# score columns as 'number'.
DATA_TITILE_TYPE = ["markdown"] * 5 + ["number"] * 20
I2V_TITILE_TYPE = ["markdown"] + ["number"] * 20

SUBMISSION_NAME = "vstar_leaderboard_submission"
# Join with "/" explicitly: os.path.join is not URL-safe (it would insert "\\" on Windows).
SUBMISSION_URL = "https://huggingface.co/datasets/V-STaR-Bench/" + SUBMISSION_NAME
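
# --- Illustrative sketch only: one way the weights above could combine
# per-dimension scores into a total score. The function name and the exact
# formula are assumptions, not the leaderboard's official aggregation code.
def example_total_score(scores):
    """Weighted mean of the quality and semantic groups.

    `scores` maps each dimension in QUALITY_LIST + SEMANTIC_LIST to a
    normalized score in [0, 1].
    """
    quality = sum(scores[d] * DIM_WEIGHT[d] for d in QUALITY_LIST)
    quality /= sum(DIM_WEIGHT[d] for d in QUALITY_LIST)
    semantic = sum(scores[d] * DIM_WEIGHT[d] for d in SEMANTIC_LIST)
    semantic /= sum(DIM_WEIGHT[d] for d in SEMANTIC_LIST)
    # Quality counts QUALITY_WEIGHT : SEMANTIC_WEIGHT relative to semantics.
    return (QUALITY_WEIGHT * quality + SEMANTIC_WEIGHT * semantic) / (
        QUALITY_WEIGHT + SEMANTIC_WEIGHT
    )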
# Local paths to the result tables pulled from the submission dataset.
CSV_DIR = "./vstar_leaderboard_submission/results.csv"
QUALITY_DIR = "./vstar_leaderboard_submission/quality.csv"
I2V_DIR = "./vstar_leaderboard_submission/i2v_results.csv"
LONG_DIR = "./vstar_leaderboard_submission/long_debug.csv"
INFO_DIR = "./vstar_leaderboard_submission/model_info.csv"

# Display-order column headers for each leaderboard tab.
COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES_QUALITY = MODEL_INFO_TAB_QUALITY + QUALITY_TAB
COLUMN_NAMES_I2V = MODEL_INFO_TAB_I2V + I2V_TAB
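
# --- Illustrative sketch only: loading a results table with the constants
# above. Assumes the CSV at CSV_DIR has COLUMN_NAMES as its header;
# `example_load_results` is a hypothetical helper, not the app's real loader.
def example_load_results():
    import pandas as pd

    df = pd.read_csv(CSV_DIR)
    # Keep the expected columns in display order, best Total Score first.
    return df[COLUMN_NAMES].sort_values(by="Total Score", ascending=False)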
LEADERBORAD_INTRODUCTION = """# V-STaR Leaderboard

*"Can Video-LLMs “reason through a sequential spatio-temporal logic” in videos?"*

🏆 Welcome to the leaderboard of **V-STaR**!

🎦 *A spatio-temporal reasoning benchmark for Video-LLMs*

[GitHub](https://github.com/V-STaR-Bench/V-STaR)

- **Comprehensive Dimensions:** We evaluate a Video-LLM’s spatio-temporal reasoning ability in answering questions explicitly in the context of “when”, “where”, and “what”.
- **Human Alignment:** We conducted extensive experiments and human annotations to validate the robustness of V-STaR.
- **New Metrics:** We propose the Arithmetic Mean (AM) and a modified logarithmic Geometric Mean (LGM) to measure the spatio-temporal reasoning capability of Video-LLMs. AM and LGM are computed from the "Accuracy" of VQA, the "m_tIoU" of temporal grounding, and the "m_vIoU" of spatial grounding; averaging them over our two proposed RSTR question chains gives the mean AM (mAM) and mean LGM (mLGM).
- **Valuable Insights:** V-STaR reveals a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning.

**Join the Leaderboard**: Please contact us to update your results.

**Credits**: This leaderboard is updated and maintained by the team of [V-STaR Contributors]().
"""

SUBMIT_INTRODUCTION = """# Submit on V-STaR Benchmark

## 🎈 Introduction

⚠️ Please note that you need to obtain the `results/*.json` files by running V-STaR from GitHub. You may conduct an [offline evaluation](https://github.com/V-STaR-Bench/V-STaR) before submitting. ⚠️

Then, please contact us to update your results via [email1](mailto:zixu.cheng@qmul.ac.uk) or [email2](mailto:hu.jian@qmul.ac.uk).
"""

TABLE_INTRODUCTION = """ """

LEADERBORAD_INFO = """
V-STaR is a comprehensive spatio-temporal reasoning benchmark for video large language models (Video-LLMs). We construct a fine-grained reasoning dataset with coarse-to-fine CoT questions, enabling a structured evaluation of spatio-temporal reasoning. Specifically, we introduce a Reverse Spatio-Temporal Reasoning (RSTR) task to quantify models’ spatio-temporal reasoning ability. Experiments on V-STaR reveal that although many models perform well on “what”, some struggle to ground their answers in time and location. This finding highlights a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning and inspires research into trustworthy spatio-temporal understanding for future Video-LLMs.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{cheng2025vstarbenchmarkingvideollmsvideo,
    title={V-STaR: Benchmarking Video-LLMs on Video Spatio-Temporal Reasoning},
    author={Zixu Cheng and Jian Hu and Ziquan Liu and Chenyang Si and Wei Li and Shaogang Gong},
    year={2025},
    eprint={2503.11495},
    archivePrefix={arXiv},
    primaryClass={cs.CV},
    url={https://arxiv.org/abs/2503.11495},
}"""

QUALITY_CLAIM_TEXT = "We use all the videos on the Sora website (https://openai.com/sora) for a preliminary evaluation, including the failure-case videos Sora provided."

I2V_CLAIM_TEXT = "Since the open-sourced SVD models do not accept text input during the I2V stage, we are unable to evaluate their `camera motion` in terms of `video-text consistency`. The total score is calculated over all dimensions except `camera motion`."
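
# --- Illustrative sketch only: the AM / LGM metrics described in
# LEADERBORAD_INTRODUCTION. The precise "modified logarithmic Geometric Mean"
# is defined in the V-STaR paper (arXiv:2503.11495); the epsilon handling and
# helper names below are assumptions, not the official implementation. mAM and
# mLGM would average these values over the two RSTR question chains.
import math

def example_am(acc, m_tiou, m_viou):
    """Arithmetic Mean of VQA Accuracy, temporal m_tIoU, and spatial m_vIoU."""
    return (acc + m_tiou + m_viou) / 3

def example_lgm(acc, m_tiou, m_viou, eps=1e-6):
    """Geometric mean computed in log space; eps guards against zero scores."""
    return math.exp(sum(math.log(s + eps) for s in (acc, m_tiou, m_viou)) / 3)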
LONG_CLAIM_TEXT = ""

# Per-dimension Min/Max used to linearly rescale raw scores to [0, 1].
NORMALIZE_DIC = {
    "subject consistency": {"Min": 0.1462, "Max": 1.0},
    "background consistency": {"Min": 0.2615, "Max": 1.0},
    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
    "dynamic degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
    "imaging quality": {"Min": 0.0, "Max": 1.0},
    "object class": {"Min": 0.0, "Max": 1.0},
    "multiple objects": {"Min": 0.0, "Max": 1.0},
    "human action": {"Min": 0.0, "Max": 1.0},
    "color": {"Min": 0.0, "Max": 1.0},
    "spatial relationship": {"Min": 0.0, "Max": 1.0},
    "scene": {"Min": 0.0, "Max": 0.8222},
    "appearance style": {"Min": 0.0009, "Max": 0.2855},
    "temporal style": {"Min": 0.0, "Max": 0.364},
    "overall consistency": {"Min": 0.0, "Max": 0.364},
}

NORMALIZE_DIC_I2V = {
    "Video-Text Camera Motion": {"Min": 0.0, "Max": 1.0},
    "Video-Image Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Video-Image Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Motion Smoothness": {"Min": 0.7060, "Max": 0.9975},
    "Dynamic Degree": {"Min": 0.0, "Max": 1.0},
    "Aesthetic Quality": {"Min": 0.0, "Max": 1.0},
    "Imaging Quality": {"Min": 0.0, "Max": 1.0},
    "Temporal Flickering": {"Min": 0.6293, "Max": 1.0},
}
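
# --- Illustrative sketch only: min-max normalization with the ranges above.
# `example_normalize` is a hypothetical helper; the leaderboard's own code may
# clip or round differently.
def example_normalize(dim, raw):
    """Rescale a raw score for `dim` to [0, 1] using NORMALIZE_DIC."""
    lo, hi = NORMALIZE_DIC[dim]["Min"], NORMALIZE_DIC[dim]["Max"]
    return min(max((raw - lo) / (hi - lo), 0.0), 1.0)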