# Constants for the V-STaR leaderboard Space.
MODEL_INFO = [
"Model Name (clickable)",
"Sampled by",
"Evaluated by",
"Accessibility",
"Date",
"Total Score",
"Quality Score",
"Semantic Score",
"Selected Score",
]
MODEL_INFO_TAB_QUALITY = [
"Model Name (clickable)",
"Quality Score",
"Selected Score"
]
MODEL_INFO_TAB_I2V = [
"Model Name (clickable)",
"Sampled by",
"Evaluated by",
"Accessibility",
"Date",
"Total Score",
"I2V Score",
"Quality Score",
"Selected Score"
]
TASK_INFO = [
"subject consistency",
"background consistency",
"temporal flickering",
"motion smoothness",
"dynamic degree",
"aesthetic quality",
"imaging quality",
"object class",
"multiple objects",
"human action",
"color",
"spatial relationship",
"scene",
"appearance style",
"temporal style",
"overall consistency"
]
# DEFAULT_INFO mirrors TASK_INFO; copy it so mutating one list never affects the other.
DEFAULT_INFO = list(TASK_INFO)
QUALITY_LIST = [
"subject consistency",
"background consistency",
"temporal flickering",
"motion smoothness",
"aesthetic quality",
"imaging quality",
"dynamic degree",]
SEMANTIC_LIST = [
"object class",
"multiple objects",
"human action",
"color",
"spatial relationship",
"scene",
"appearance style",
"temporal style",
"overall consistency"
]
QUALITY_TAB = [
"subject consistency",
"background consistency",
"motion smoothness",
"aesthetic quality",
"imaging quality",
"dynamic degree",]
I2V_LIST = [
"Video-Text Camera Motion",
"Video-Image Subject Consistency",
"Video-Image Background Consistency",
]
I2V_QUALITY_LIST = [
"Subject Consistency",
"Background Consistency",
"Motion Smoothness",
"Dynamic Degree",
"Aesthetic Quality",
"Imaging Quality",
# "Temporal Flickering"
]
I2V_TAB = [
"Video-Text Camera Motion",
"Video-Image Subject Consistency",
"Video-Image Background Consistency",
"Subject Consistency",
"Background Consistency",
"Motion Smoothness",
"Dynamic Degree",
"Aesthetic Quality",
"Imaging Quality",
# "Temporal Flickering"
]
DIM_WEIGHT = {
"subject consistency": 1,
"background consistency": 1,
"temporal flickering": 1,
"motion smoothness": 1,
"aesthetic quality": 1,
"imaging quality": 1,
"dynamic degree": 0.5,
"object class": 1,
"multiple objects": 1,
"human action": 1,
"color": 1,
"spatial relationship": 1,
"scene": 1,
"appearance style": 1,
"temporal style": 1,
"overall consistency": 1
}
DIM_WEIGHT_I2V = {
"Video-Text Camera Motion": 0.1,
"Video-Image Subject Consistency": 1,
"Video-Image Background Consistency": 1,
"Subject Consistency": 1,
"Background Consistency": 1,
"Motion Smoothness": 1,
"Dynamic Degree": 0.5,
"Aesthetic Quality": 1,
"Imaging Quality": 1,
"Temporal Flickering": 1
}
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0
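
# A minimal sketch (an assumption, not the Space's actual logic, which lives in
# app.py) of how the group weights above would mix a quality score and a
# semantic score into the leaderboard's "Total Score".
def mix_total_score(quality_score: float, semantic_score: float) -> float:
    """Weighted mean of the two score groups, here 4:1 quality vs. semantic."""
    return (QUALITY_WEIGHT * quality_score + SEMANTIC_WEIGHT * semantic_score) / (
        QUALITY_WEIGHT + SEMANTIC_WEIGHT
    )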
# Per-column datatypes for the leaderboard tables (e.g. for a Gradio Dataframe).
# Note: the "TITILE" spelling is kept because other modules import these names.
DATA_TITILE_TYPE = ['markdown'] * 5 + ['number'] * 20
I2V_TITILE_TYPE = ['markdown'] + ['number'] * 20
SUBMISSION_NAME = "vstar_leaderboard_submission"
# Plain string concatenation: os.path.join would emit backslashes on Windows.
SUBMISSION_URL = "https://huggingface.co/datasets/V-STaR-Bench/" + SUBMISSION_NAME
CSV_DIR = "./vstar_leaderboard_submission/results.csv"
QUALITY_DIR = "./vstar_leaderboard_submission/quality.csv"
I2V_DIR = "./vstar_leaderboard_submission/i2v_results.csv"
LONG_DIR = "./vstar_leaderboard_submission/long_debug.csv"
INFO_DIR = "./vstar_leaderboard_submission/model_info.csv"
COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES_QUALITY = MODEL_INFO_TAB_QUALITY + QUALITY_TAB
COLUMN_NAMES_I2V = MODEL_INFO_TAB_I2V + I2V_TAB
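
# Self-check (runs only when this module is executed directly): the datatype
# list must line up one-to-one with the assembled headers, or the table widget
# will render columns with the wrong type.
if __name__ == "__main__":
    assert len(DATA_TITILE_TYPE) == len(COLUMN_NAMES), (
        f"{len(DATA_TITILE_TYPE)} datatypes vs. {len(COLUMN_NAMES)} columns"
    )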
LEADERBORAD_INTRODUCTION = """# V-STaR Leaderboard
*"Can Video-LLMs reason through sequential spatio-temporal logic in videos?"*
🏆 Welcome to the leaderboard of **V-STaR**! 🎦 *A spatio-temporal reasoning benchmark for Video-LLMs* [![Code](https://img.shields.io/github/stars/V-STaR-Bench/V-STaR.svg?style=social&label=Official)](https://github.com/V-STaR-Bench/V-STaR)
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href='https://arxiv.org/abs/2503.11495'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
<a href='https://v-star-bench.github.io/'><img src='https://img.shields.io/badge/V--STaR-Website-green?logo=googlechrome&logoColor=green'></a>
</div>
- **Comprehensive Dimensions:** We evaluate a Video-LLM's spatio-temporal reasoning ability with questions explicitly framed around "when", "where", and "what".
- **Human Alignment:** We conducted extensive experiments and human annotations to validate the robustness of V-STaR.
- **New Metrics:** We propose the Arithmetic Mean (AM) and a modified logarithmic Geometric Mean (LGM) to measure the spatio-temporal reasoning capability of Video-LLMs. AM and LGM are computed from the "Accuracy" of VQA, the "m_tIoU" of temporal grounding, and the "m_vIoU" of spatial grounding; averaging over our two proposed RSTR question chains yields the mean AM (mAM) and mean LGM (mLGM).
- **Valuable Insights:** V-STaR reveals a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning.
**Join Leaderboard**: Please contact us to update your results.
**Credits**: This leaderboard is updated and maintained by the team of [V-STaR Contributors]().
"""
SUBMIT_INTRODUCTION = """# Submitting to the V-STaR Benchmark
## 🎈
⚠️ Please note that you need to obtain the `results/*.json` files by running V-STaR from GitHub. You may conduct an [offline evaluation](https://github.com/V-STaR-Bench/V-STaR) before submitting.
⚠️ Then, please contact us to update your results via [email1](mailto:[email protected]) or [email2](mailto:[email protected]).
"""
TABLE_INTRODUCTION = """
"""
LEADERBORAD_INFO = """
V-STaR is a comprehensive spatio-temporal reasoning benchmark for video large language models (Video-LLMs). We construct a fine-grained reasoning dataset with coarse-to-fine CoT questions, enabling a structured evaluation of spatio-temporal reasoning. Specifically, we introduce a Reverse Spatio-Temporal Reasoning (RSTR) task to quantify models' spatio-temporal reasoning ability. Experiments on V-STaR reveal that although many models perform well on "what", some struggle to ground their answers in time and location. This finding highlights a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning and motivates research on improving trustworthy spatio-temporal understanding in future Video-LLMs.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{cheng2025vstarbenchmarkingvideollmsvideo,
title={V-STaR: Benchmarking Video-LLMs on Video Spatio-Temporal Reasoning},
author={Zixu Cheng and Jian Hu and Ziquan Liu and Chenyang Si and Wei Li and Shaogang Gong},
year={2025},
eprint={2503.11495},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2503.11495},
}"""
QUALITY_CLAIM_TEXT = "We use all the videos on the Sora website (https://openai.com/sora) for a preliminary evaluation, including the failure-case videos Sora provided."
I2V_CLAIM_TEXT = "Since the open-sourced SVD models do not accept text input during the I2V stage, we are unable to evaluate their `camera motion` in terms of `video-text consistency`. The total score is calculated based on all dimensions except `camera motion`."
LONG_CLAIM_TEXT = ""
NORMALIZE_DIC = {
"subject consistency": {"Min": 0.1462, "Max": 1.0},
"background consistency": {"Min": 0.2615, "Max": 1.0},
"temporal flickering": {"Min": 0.6293, "Max": 1.0},
"motion smoothness": {"Min": 0.706, "Max": 0.9975},
"dynamic degree": {"Min": 0.0, "Max": 1.0},
"aesthetic quality": {"Min": 0.0, "Max": 1.0},
"imaging quality": {"Min": 0.0, "Max": 1.0},
"object class": {"Min": 0.0, "Max": 1.0},
"multiple objects": {"Min": 0.0, "Max": 1.0},
"human action": {"Min": 0.0, "Max": 1.0},
"color": {"Min": 0.0, "Max": 1.0},
"spatial relationship": {"Min": 0.0, "Max": 1.0},
"scene": {"Min": 0.0, "Max": 0.8222},
"appearance style": {"Min": 0.0009, "Max": 0.2855},
"temporal style": {"Min": 0.0, "Max": 0.364},
"overall consistency": {"Min": 0.0, "Max": 0.364}
}
NORMALIZE_DIC_I2V = {
"Video-Text Camera Motion": {"Min": 0.0, "Max": 1.0},
"Video-Image Subject Consistency": {"Min": 0.1462, "Max": 1.0},
"Video-Image Background Consistency": {"Min": 0.2615, "Max": 1.0},
"Subject Consistency": {"Min": 0.1462, "Max": 1.0},
"Background Consistency": {"Min": 0.2615, "Max": 1.0},
"Motion Smoothness": {"Min": 0.7060, "Max": 0.9975},
"Dynamic Degree": {"Min": 0.0, "Max": 1.0},
"Aesthetic Quality": {"Min": 0.0, "Max": 1.0},
"Imaging Quality": {"Min": 0.0, "Max": 1.0},
"Temporal Flickering": {"Min": 0.6293, "Max": 1.0}
}
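
# A minimal sketch (an assumption; the Space's real aggregation lives in
# app.py) of how the min/max ranges above would be applied: min-max normalize
# each raw dimension score, then take a DIM_WEIGHT-weighted mean over a group
# of dimensions such as QUALITY_LIST or SEMANTIC_LIST.
def normalize(dim: str, raw: float) -> float:
    """Map a raw dimension score into [0, 1] using the ranges in NORMALIZE_DIC."""
    lo, hi = NORMALIZE_DIC[dim]["Min"], NORMALIZE_DIC[dim]["Max"]
    return (raw - lo) / (hi - lo)

def group_score(raw_scores: dict, dims: list) -> float:
    """DIM_WEIGHT-weighted mean of normalized scores over the given dimensions."""
    total_weight = sum(DIM_WEIGHT[d] for d in dims)
    return sum(DIM_WEIGHT[d] * normalize(d, raw_scores[d]) for d in dims) / total_weight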