RoFurukawa committed
Commit cc6f8e5 · verified · 1 Parent(s): 39225be

Create app.py

Files changed (1)
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
+import time, torch, gradio as gr
+from transformers import AutoProcessor, AutoModelForImageTextToText
+
+MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
+
+# Pick a safe float dtype for the GPU (Ampere+ -> bf16; older GPUs -> fp16; CPU -> fp32)
+if torch.cuda.is_available():
+    major, _ = torch.cuda.get_device_capability()
+    FLOAT_DTYPE = torch.bfloat16 if major >= 8 else torch.float16
+else:
+    FLOAT_DTYPE = torch.float32
+
+# Load once at import time so subsequent runs are fast
+model = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID, torch_dtype=FLOAT_DTYPE, device_map="auto"
+)
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+
+def run_video(video_path, prompt, max_new_tokens=256, backend="decord", num_frames=32):
+    """video_path is a local file path; backend in {'decord', 'pyav', 'opencv', 'torchvision'}."""
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "video", "path": video_path},
+            {"type": "text", "text": prompt},
+        ],
+    }]
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt",
+        video_load_backend=backend, num_frames=num_frames,
+    )
+
+    # Move tensors to the model's device; cast only floating-point tensors
+    # (pixel values) to FLOAT_DTYPE and keep integer token IDs as int64
+    for k, v in list(inputs.items()):
+        if isinstance(v, torch.Tensor):
+            v = v.to(model.device)
+            if torch.is_floating_point(v):
+                v = v.to(dtype=FLOAT_DTYPE)
+            inputs[k] = v
+
+    gen_kwargs = {
+        "do_sample": False,
+        "max_new_tokens": max_new_tokens,
+        "eos_token_id": getattr(model.generation_config, "eos_token_id", None)
+                        or getattr(processor.tokenizer, "eos_token_id", None),
+        "pad_token_id": getattr(model.generation_config, "pad_token_id", None)
+                        or getattr(processor.tokenizer, "pad_token_id", None),
+    }
+
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+
+    t0 = time.perf_counter()
+    out_ids = model.generate(**inputs, **gen_kwargs)
+    latency = time.perf_counter() - t0
+
+    # Decode only the newly generated tokens so the prompt is not echoed back
+    prompt_len = inputs["input_ids"].shape[-1]
+    new_ids = out_ids[:, prompt_len:]
+    text = processor.batch_decode(new_ids, skip_special_tokens=True)[0]
+    vram_gb = (torch.cuda.max_memory_allocated() / 1e9) if torch.cuda.is_available() else 0.0
+    tokens_generated = int(new_ids.shape[-1])
+
+    # Minimal pretty-printed report
+    pretty = (f"Latency: {latency:.3f}s | VRAM: {vram_gb:.2f} GB | Tokens: {tokens_generated}\n"
+              f"{'-' * 40}\n{text.strip()}")
+    return pretty
+
+def infer(video, prompt, tokens, frames, backend):
+    # gr.Video may return a plain path, a dict, or a tempfile-like object
+    # depending on the Gradio version; normalize to a filesystem path
+    if isinstance(video, str):
+        path = video
+    elif isinstance(video, dict):
+        path = video.get("path") or video.get("name")
+    else:
+        path = getattr(video, "name", None)
+    if not path:
+        return "No video file received."
+    return run_video(path, prompt, max_new_tokens=int(tokens), backend=backend, num_frames=int(frames))
+
+with gr.Blocks() as demo:
+    gr.Markdown("## SmolVLM2-256M Video Test\nUpload an MP4 and enter your prompt. "
+                "This Space mirrors your Colab test.")
+    with gr.Row():
+        vid = gr.Video(label="Upload MP4", sources=["upload"], include_audio=False)
+        with gr.Column():
+            prompt = gr.Textbox(label="Prompt", value="Describe this video to me", lines=2)
+            tokens = gr.Slider(32, 512, value=256, step=16, label="max_new_tokens")
+            frames = gr.Slider(8, 64, value=32, step=8, label="num_frames (sampling)")
+            backend = gr.Dropdown(choices=["decord", "pyav", "opencv", "torchvision"],
+                                  value="decord", label="video_load_backend")
+    btn = gr.Button("Run")
+    out = gr.Textbox(label="Output", lines=15)
+    btn.click(fn=infer, inputs=[vid, prompt, tokens, frames, backend], outputs=out)
+
+if __name__ == "__main__":
+    demo.launch()
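
To sanity-check the app outside the Gradio UI, a minimal smoke test could call run_video directly. This is a sketch, not part of the commit: it assumes app.py is importable from the working directory and that a local clip named sample.mp4 exists (a placeholder name). Note that the default decord backend requires the decord package to be installed in the Space alongside torch, transformers, and gradio.

# smoke_test.py -- a hypothetical local check, not part of this commit.
# Importing app triggers the one-time model load, just as it does in the Space.
from app import run_video

if __name__ == "__main__":
    report = run_video(
        "sample.mp4",                  # placeholder path to a local test clip
        "Describe this video to me",
        max_new_tokens=64,
        backend="decord",              # requires the decord package
        num_frames=16,
    )
    print(report)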