Video-Analysis-AppleFastVLM-7B

Sleeping

App Files Files Community

rahul7star committed 13 days ago

Commit

4ea68ce

verified ·

1 Parent(s): 920c71d

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -15

app.py CHANGED Viewed

@@ -4,12 +4,18 @@ from PIL import Image
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import cv2
 import numpy as np
-import os
 MID = "apple/FastVLM-7B"
 IMAGE_TOKEN_INDEX = -200
-# Initialize model variables
 tok = None
 model = None
@@ -17,25 +23,28 @@ model = None
 def load_model():
     global tok, model
     if tok is None or model is None:
-        print("Loading FastVLM model (CPU only)...")
         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             MID,
-            torch_dtype=torch.float32,   # ✅ CPU-friendly dtype
             device_map="cpu",            # ✅ Force CPU
             trust_remote_code=True,
         )
-        print("Model loaded successfully on CPU!")
     return tok, model
 # ---------------- Frame Extraction ----------------
 def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
     cap = cv2.VideoCapture(video_path)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     if total_frames == 0:
         cap.release()
         return []
     frames = []
@@ -50,12 +59,17 @@ def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str =
         start = max(0, (total_frames - num_frames) // 2)
         indices = list(range(start, min(start + num_frames, total_frames)))
     for idx in indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
         if ret:
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frames.append(Image.fromarray(frame_rgb))
     cap.release()
     return frames
@@ -65,6 +79,8 @@ def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str =
 def caption_frame(image: Image.Image, prompt: str) -> str:
     tok, model = load_model()
     messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
     rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
     pre, post = rendered.split("<image>", 1)
@@ -88,10 +104,14 @@ def caption_frame(image: Image.Image, prompt: str) -> str:
             do_sample=True,
         )
-    caption = tok.decode(out[0], skip_special_tokens=True)
     if prompt in caption:
         caption = caption.split(prompt)[-1].strip()
     return caption
@@ -99,13 +119,16 @@ def caption_frame(image: Image.Image, prompt: str) -> str:
 def process_video(video_path, num_frames, sampling_method, chat_history, progress=gr.Progress()):
     if not video_path:
         chat_history.append(["Assistant", "Please upload a video first."])
         return chat_history, None
     progress(0, desc="Extracting frames...")
     frames = extract_frames(video_path, num_frames, sampling_method)
     if not frames:
         chat_history.append(["Assistant", "Failed to extract frames."])
         return chat_history, None
     prompt = "Provide a brief one-sentence description of what's happening in this image."
@@ -117,29 +140,28 @@ def process_video(video_path, num_frames, sampling_method, chat_history, progres
         captions.append(f"Frame {i+1}: {caption}")
         chat_history[-1] = ["Assistant", "\n".join(captions)]
         progress((i + 1) / len(frames))
     progress(1.0, desc="Analysis complete!")
     return chat_history, frames
-# ---------------- Custom Apple-like Theme ----------------
 class AppleTheme(gr.themes.Base):
     def __init__(self):
         super().__init__(
             primary_hue=gr.themes.colors.blue,
             secondary_hue=gr.themes.colors.gray,
             neutral_hue=gr.themes.colors.gray,
-            spacing_size=gr.themes.sizes.spacing_md,
-            radius_size=gr.themes.sizes.radius_md,
-            text_size=gr.themes.sizes.text_md,
-            font=[gr.themes.GoogleFont("Inter"), "SF Pro Display", "Helvetica Neue", "Arial", "sans-serif"],
-            font_mono=[gr.themes.GoogleFont("SF Mono"), "Consolas", "monospace"]
         )
-# ---------------- Gradio UI ----------------
 with gr.Blocks(theme=AppleTheme()) as demo:
-    gr.Markdown("# 🎬 FastVLM Video Captioning (CPU Only)")
     with gr.Row():
         with gr.Column(scale=7):
@@ -168,7 +190,7 @@ with gr.Blocks(theme=AppleTheme()) as demo:
 # ---------------- Launch ----------------
 demo.launch(
-    server_name="0.0.0.0",  # Spaces/containers need this
     server_port=7860,
     share=False,
     show_error=True

 from transformers import AutoTokenizer, AutoModelForCausalLM
 import cv2
 import numpy as np
+import logging
+# ---------------- Logging Setup ----------------
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[logging.StreamHandler()]
+)
 MID = "apple/FastVLM-7B"
 IMAGE_TOKEN_INDEX = -200
 tok = None
 model = None
 def load_model():
     global tok, model
     if tok is None or model is None:
+        logging.info("Loading FastVLM model (CPU only)...")
         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             MID,
+            torch_dtype=torch.float32,   # ✅ CPU-friendly
             device_map="cpu",            # ✅ Force CPU
             trust_remote_code=True,
         )
+        logging.info("✅ Model loaded successfully on CPU")
     return tok, model
 # ---------------- Frame Extraction ----------------
 def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
+    logging.info(f"Extracting up to {num_frames} frames using '{sampling_method}' sampling")
     cap = cv2.VideoCapture(video_path)
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    logging.info(f"Total frames in video: {total_frames}")
     if total_frames == 0:
         cap.release()
+        logging.warning("⚠️ No frames found in video")
         return []
     frames = []
         start = max(0, (total_frames - num_frames) // 2)
         indices = list(range(start, min(start + num_frames, total_frames)))
+    logging.info(f"Selected frame indices: {indices}")
     for idx in indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
         if ret:
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             frames.append(Image.fromarray(frame_rgb))
+            logging.info(f"✅ Extracted frame {idx}")
+        else:
+            logging.warning(f"⚠️ Failed to extract frame {idx}")
     cap.release()
     return frames
 def caption_frame(image: Image.Image, prompt: str) -> str:
     tok, model = load_model()
+    logging.info(f"Captioning frame with prompt: {prompt!r}")
     messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
     rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
     pre, post = rendered.split("<image>", 1)
             do_sample=True,
         )
+    raw_output = tok.decode(out[0], skip_special_tokens=True)
+    logging.info(f"Raw model output: {raw_output!r}")
+    caption = raw_output
     if prompt in caption:
         caption = caption.split(prompt)[-1].strip()
+    logging.info(f"✅ Final cleaned caption: {caption!r}")
     return caption
 def process_video(video_path, num_frames, sampling_method, chat_history, progress=gr.Progress()):
     if not video_path:
         chat_history.append(["Assistant", "Please upload a video first."])
+        logging.warning("No video uploaded")
         return chat_history, None
+    logging.info(f"Starting analysis of video: {video_path}")
     progress(0, desc="Extracting frames...")
     frames = extract_frames(video_path, num_frames, sampling_method)
     if not frames:
         chat_history.append(["Assistant", "Failed to extract frames."])
+        logging.error("No frames extracted")
         return chat_history, None
     prompt = "Provide a brief one-sentence description of what's happening in this image."
         captions.append(f"Frame {i+1}: {caption}")
         chat_history[-1] = ["Assistant", "\n".join(captions)]
         progress((i + 1) / len(frames))
+        logging.info(f"Progress: frame {i+1}/{len(frames)} analyzed")
+    final_summary = "\n".join(captions)
+    logging.info("✅ Video analysis complete")
+    logging.info(f"Final summary:\n{final_summary}")
     progress(1.0, desc="Analysis complete!")
     return chat_history, frames
+# ---------------- Gradio UI ----------------
 class AppleTheme(gr.themes.Base):
     def __init__(self):
         super().__init__(
             primary_hue=gr.themes.colors.blue,
             secondary_hue=gr.themes.colors.gray,
             neutral_hue=gr.themes.colors.gray,
         )
 with gr.Blocks(theme=AppleTheme()) as demo:
+    gr.Markdown("# 🎬 FastVLM Video Captioning (CPU Only, with Logs)")
     with gr.Row():
         with gr.Column(scale=7):
 # ---------------- Launch ----------------
 demo.launch(
+    server_name="0.0.0.0",
     server_port=7860,
     share=False,
     show_error=True