ShotVL-7B
This model is a fine-tuned version of Qwen/Qwen2.5-VL-7B-Instruct, trained with supervised fine-tuning on the largest high-quality dataset for cinematic language understanding to date. It currently achieves state-of-the-art performance on ShotBench, a comprehensive benchmark for evaluating cinematography understanding in vision-language models. Please see our paper for more details.
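
The quick-start examples below assume the transformers, qwen-vl-utils, and accelerate packages are installed (accelerate is needed for device_map), plus FlashAttention 2 for attn_implementation="flash_attention_2"; if FlashAttention is not available in your environment, removing that argument and falling back to the default attention should also work.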
Image
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

device = "cuda"
device_map = "balanced"
dtype = torch.bfloat16
image_path = "/path/to/image.jpg"

# Load the model and processor.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Vchitect/ShotVL-7B",
    device_map=device_map,
    attn_implementation="flash_attention_2",
    torch_dtype=dtype,
).eval()
processor = AutoProcessor.from_pretrained(
    "Vchitect/ShotVL-7B", revision="refs/pr/24", use_fast=True, torch_dtype=dtype
)

# Build a chat-style prompt containing the image and the question.
msgs = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": "What's the shot size of this shot?"},
        ],
    },
]

text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(msgs)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(device)

# Generate, then decode only the newly generated tokens.
with torch.inference_mode():
    out_ids = model.generate(**inputs, max_new_tokens=640)
trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, out_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])
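
The loaded model and processor can be reused for several questions about the same frame. The helper below is a minimal sketch that simply repackages the steps above; the ask_image name and the extra prompts are illustrative, not part of the released API.

def ask_image(image_path, question, max_new_tokens=128):
    # Reuses the model, processor, and device defined above.
    msgs = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": question},
            ],
        },
    ]
    text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(msgs)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)
    with torch.inference_mode():
        out_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, out_ids)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

# Example: query a few cinematography attributes for one frame (prompt wording is illustrative).
for q in [
    "What's the shot size of this shot?",
    "What's the camera angle of this shot?",
    "What type of lighting is used in this shot?",
]:
    print(q, "->", ask_image("/path/to/image.jpg", q))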
Video
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

device = "cuda"
device_map = "balanced"
dtype = torch.bfloat16
video_path = "/path/to/video.mp4"

# Load the model and processor.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Vchitect/ShotVL-7B",
    device_map=device_map,
    attn_implementation="flash_attention_2",
    torch_dtype=dtype,
).eval()
processor = AutoProcessor.from_pretrained(
    "Vchitect/ShotVL-7B", revision="refs/pr/24", use_fast=True, torch_dtype=dtype
)

# Multiple-choice question about the camera movement in the clip.
question = (
    "What's the camera movement in this movie shot?\n"
    "Options:\nA. Boom down\nB. Boom up\nC. Push in\nD. Pull out\n"
    "Please select the most likely answer from the options above.\n"
)

# Build a chat-style prompt containing the video and the question;
# max_pixels caps per-frame resolution and fps sets the frame sampling rate.
msgs = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "video", "video": video_path, "max_pixels": 360 * 640, "fps": 12.0},
            {"type": "text", "text": question},
        ],
    },
]

text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(msgs)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(device)

# Generate, then decode only the newly generated tokens.
with torch.inference_mode():
    out_ids = model.generate(**inputs, max_new_tokens=640)
trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, out_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])
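
In the video message above, fps controls how densely qwen-vl-utils samples frames from the clip and max_pixels caps the per-frame resolution, so lowering either reduces the number of visual tokens and the GPU memory required. Because the prompt is multiple-choice, the reply usually contains the selected option letter; the snippet below is a small sketch for extracting it with a regex (the model's exact output format is not guaranteed).

import re

answer = processor.batch_decode(trimmed, skip_special_tokens=True)[0]
# Sketch: take the first standalone option letter A-D, falling back to the full reply.
match = re.search(r"\b([A-D])\b", answer)
print(match.group(1) if match else answer)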