ShotVL-7B
This model is a fine-tuned version of Qwen/Qwen2.5-VL-7B-Instruct, trained with supervised fine-tuning on the largest high-quality dataset for cinematic language understanding to date. It currently achieves state-of-the-art performance on ShotBench, a comprehensive benchmark for evaluating cinematography understanding in vision-language models. Please see our paper for more details.
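
The quick-start examples below assume the transformers, qwen-vl-utils, and accelerate packages are installed (accelerate is needed for device_map), plus FlashAttention 2 for attn_implementation="flash_attention_2"; if FlashAttention is not available in your environment, removing that argument and falling back to the default attention should also work.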
Image
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

device = "cuda"
device_map = "balanced"
dtype = torch.bfloat16
image_path = "/path/to/image.jpg"

# Load the model and processor.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Vchitect/ShotVL-7B",
    device_map=device_map,
    attn_implementation="flash_attention_2",
    torch_dtype=dtype,
).eval()
processor = AutoProcessor.from_pretrained(
    "Vchitect/ShotVL-7B", revision="refs/pr/24", use_fast=True, torch_dtype=dtype
)

# Build a chat-style prompt containing the image and the question.
msgs = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": "What's the shot size of this shot?"},
        ],
    },
]

text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(msgs)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(device)

# Generate, then decode only the newly generated tokens.
with torch.inference_mode():
    out_ids = model.generate(**inputs, max_new_tokens=640)
trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, out_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])
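
The loaded model and processor can be reused for several questions about the same frame. The helper below is a minimal sketch that simply repackages the steps above; the ask_image name and the extra prompts are illustrative, not part of the released API.

def ask_image(image_path, question, max_new_tokens=128):
    # Reuses the model, processor, and device defined above.
    msgs = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": question},
            ],
        },
    ]
    text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(msgs)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)
    with torch.inference_mode():
        out_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, out_ids)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

# Example: query a few cinematography attributes for one frame (prompt wording is illustrative).
for q in [
    "What's the shot size of this shot?",
    "What's the camera angle of this shot?",
    "What type of lighting is used in this shot?",
]:
    print(q, "->", ask_image("/path/to/image.jpg", q))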
Video
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

device = "cuda"
device_map = "balanced"
dtype = torch.bfloat16
video_path = "/path/to/video.mp4"

# Load the model and processor.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Vchitect/ShotVL-7B",
    device_map=device_map,
    attn_implementation="flash_attention_2",
    torch_dtype=dtype,
).eval()
processor = AutoProcessor.from_pretrained(
    "Vchitect/ShotVL-7B", revision="refs/pr/24", use_fast=True, torch_dtype=dtype
)

# Multiple-choice question about the camera movement in the clip.
question = (
    "What's the camera movement in this movie shot?\n"
    "Options:\nA. Boom down\nB. Boom up\nC. Push in\nD. Pull out\n"
    "Please select the most likely answer from the options above.\n"
)

# Build a chat-style prompt containing the video and the question;
# max_pixels caps per-frame resolution and fps sets the frame sampling rate.
msgs = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "video", "video": video_path, "max_pixels": 360 * 640, "fps": 12.0},
            {"type": "text", "text": question},
        ],
    },
]

text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(msgs)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(device)

# Generate, then decode only the newly generated tokens.
with torch.inference_mode():
    out_ids = model.generate(**inputs, max_new_tokens=640)
trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, out_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])
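
In the video message above, fps controls how densely qwen-vl-utils samples frames from the clip and max_pixels caps the per-frame resolution, so lowering either reduces the number of visual tokens and the GPU memory required. Because the prompt is multiple-choice, the reply usually contains the selected option letter; the snippet below is a small sketch for extracting it with a regex (the model's exact output format is not guaranteed).

import re

answer = processor.batch_decode(trimmed, skip_special_tokens=True)[0]
# Sketch: take the first standalone option letter A-D, falling back to the full reply.
match = re.search(r"\b([A-D])\b", answer)
print(match.group(1) if match else answer)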