rahul7star committed · verified
Commit 4ea68ce · 1 Parent(s): 920c71d

Update app.py

Files changed (1): app.py +37 -15
app.py CHANGED
@@ -4,12 +4,18 @@ from PIL import Image
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import cv2
 import numpy as np
-import os
+import logging
+
+# ---------------- Logging Setup ----------------
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[logging.StreamHandler()]
+)
 
 MID = "apple/FastVLM-7B"
 IMAGE_TOKEN_INDEX = -200
 
-# Initialize model variables
 tok = None
 model = None
 
@@ -17,25 +23,28 @@ model = None
 def load_model():
     global tok, model
     if tok is None or model is None:
-        print("Loading FastVLM model (CPU only)...")
+        logging.info("Loading FastVLM model (CPU only)...")
         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             MID,
-            torch_dtype=torch.float32,  # ✅ CPU-friendly dtype
+            torch_dtype=torch.float32,  # ✅ CPU-friendly
             device_map="cpu",  # ✅ Force CPU
             trust_remote_code=True,
         )
-        print("Model loaded successfully on CPU!")
+        logging.info("Model loaded successfully on CPU")
     return tok, model
 
 
 # ---------------- Frame Extraction ----------------
 def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
+    logging.info(f"Extracting up to {num_frames} frames using '{sampling_method}' sampling")
     cap = cv2.VideoCapture(video_path)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    logging.info(f"Total frames in video: {total_frames}")
 
     if total_frames == 0:
         cap.release()
+        logging.warning("⚠️ No frames found in video")
         return []
 
     frames = []
@@ -50,12 +59,17 @@ def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str =
         start = max(0, (total_frames - num_frames) // 2)
         indices = list(range(start, min(start + num_frames, total_frames)))
 
+    logging.info(f"Selected frame indices: {indices}")
+
     for idx in indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
         if ret:
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frames.append(Image.fromarray(frame_rgb))
+            logging.info(f"✅ Extracted frame {idx}")
+        else:
+            logging.warning(f"⚠️ Failed to extract frame {idx}")
 
     cap.release()
     return frames
@@ -65,6 +79,8 @@ def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str =
 def caption_frame(image: Image.Image, prompt: str) -> str:
     tok, model = load_model()
 
+    logging.info(f"Captioning frame with prompt: {prompt!r}")
+
     messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
     rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
     pre, post = rendered.split("<image>", 1)
@@ -88,10 +104,14 @@ def caption_frame(image: Image.Image, prompt: str) -> str:
         do_sample=True,
     )
 
-    caption = tok.decode(out[0], skip_special_tokens=True)
+    raw_output = tok.decode(out[0], skip_special_tokens=True)
+    logging.info(f"Raw model output: {raw_output!r}")
+
+    caption = raw_output
     if prompt in caption:
         caption = caption.split(prompt)[-1].strip()
 
+    logging.info(f"✅ Final cleaned caption: {caption!r}")
     return caption
 
@@ -99,13 +119,16 @@ def caption_frame(image: Image.Image, prompt: str) -> str:
 def process_video(video_path, num_frames, sampling_method, chat_history, progress=gr.Progress()):
     if not video_path:
         chat_history.append(["Assistant", "Please upload a video first."])
+        logging.warning("No video uploaded")
         return chat_history, None
 
+    logging.info(f"Starting analysis of video: {video_path}")
     progress(0, desc="Extracting frames...")
     frames = extract_frames(video_path, num_frames, sampling_method)
 
     if not frames:
         chat_history.append(["Assistant", "Failed to extract frames."])
+        logging.error("No frames extracted")
         return chat_history, None
 
     prompt = "Provide a brief one-sentence description of what's happening in this image."
@@ -117,29 +140,28 @@ def process_video(video_path, num_frames, sampling_method, chat_history, progres
         captions.append(f"Frame {i+1}: {caption}")
         chat_history[-1] = ["Assistant", "\n".join(captions)]
         progress((i + 1) / len(frames))
+        logging.info(f"Progress: frame {i+1}/{len(frames)} analyzed")
+
+    final_summary = "\n".join(captions)
+    logging.info("✅ Video analysis complete")
+    logging.info(f"Final summary:\n{final_summary}")
 
     progress(1.0, desc="Analysis complete!")
     return chat_history, frames
 
 
-# ---------------- Custom Apple-like Theme ----------------
+# ---------------- Gradio UI ----------------
 class AppleTheme(gr.themes.Base):
     def __init__(self):
         super().__init__(
             primary_hue=gr.themes.colors.blue,
             secondary_hue=gr.themes.colors.gray,
             neutral_hue=gr.themes.colors.gray,
-            spacing_size=gr.themes.sizes.spacing_md,
-            radius_size=gr.themes.sizes.radius_md,
-            text_size=gr.themes.sizes.text_md,
-            font=[gr.themes.GoogleFont("Inter"), "SF Pro Display", "Helvetica Neue", "Arial", "sans-serif"],
-            font_mono=[gr.themes.GoogleFont("SF Mono"), "Consolas", "monospace"]
         )
 
 
-# ---------------- Gradio UI ----------------
 with gr.Blocks(theme=AppleTheme()) as demo:
-    gr.Markdown("# 🎬 FastVLM Video Captioning (CPU Only)")
+    gr.Markdown("# 🎬 FastVLM Video Captioning (CPU Only, with Logs)")
 
     with gr.Row():
         with gr.Column(scale=7):
@@ -168,7 +190,7 @@ with gr.Blocks(theme=AppleTheme()) as demo:
 
 # ---------------- Launch ----------------
 demo.launch(
-    server_name="0.0.0.0",  # Spaces/containers need this
+    server_name="0.0.0.0",
     server_port=7860,
     share=False,
     show_error=True
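A sizing note on the CPU-only load in the first hunk: with torch_dtype=torch.float32 and device_map="cpu", the 7B-parameter weights alone occupy roughly 7 × 10⁹ params × 4 bytes ≈ 28 GB of RAM, so the Space needs a large-memory CPU instance; a half-precision dtype would roughly halve that footprint, at some cost in CPU speed and op support.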
 
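The hunks above skip the "uniform" branch of extract_frames (old lines 42-49 are unchanged context and not shown). Based on the numpy import kept at the top and the "middle" branch that is visible, a plausible reconstruction of the elided branch, for reference only, is:

# Hypothetical sketch of the elided sampling branch -- not part of this commit.
if sampling_method == "uniform":
    # Evenly spaced indices across the whole clip, endpoints included.
    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int).tolist()
else:  # "middle": contiguous window centred in the clip (visible in the hunk above)
    start = max(0, (total_frames - num_frames) // 2)
    indices = list(range(start, min(start + num_frames, total_frames)))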
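IMAGE_TOKEN_INDEX = -200 and the pre/post split on "<image>" follow the LLaVA-style prompt assembly that FastVLM uses: the text on either side of the marker is tokenized separately and the placeholder id is spliced between the halves, to be swapped for vision features inside the model. A minimal sketch of the elided middle of caption_frame, with illustrative names (pre_ids, post_ids, img_tok, px and max_new_tokens are assumptions, not from this diff):

# Assumed shape of caption_frame's middle section (not shown in this diff).
pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)  # the -200 placeholder
input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1)
attention_mask = torch.ones_like(input_ids)

# Pixel values come from the model's own vision-tower preprocessor.
px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]
out = model.generate(
    inputs=input_ids,
    attention_mask=attention_mask,
    images=px,
    max_new_tokens=128,  # assumed; only do_sample=True is visible in the hunk
    do_sample=True,
)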
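With the new basicConfig, every logging.info / logging.warning / logging.error call writes a timestamped, levelled line to stderr (the StreamHandler default), e.g.:

2025-08-30 10:15:42,017 [INFO] Model loaded successfully on CPU

so frame extraction and captioning progress now show up in the Space's container logs, unlike the bare print calls this commit replaces. The launch block keeps server_name="0.0.0.0" (required for the app to be reachable from outside the container, as the removed inline comment noted) on the standard Spaces port 7860.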