"""FastVLM video captioning demo (CPU only, with Hugging Face upload).

Extracts frames from an uploaded video, captions each frame with
apple/FastVLM-7B, and uploads the video plus a text summary to a
Hugging Face repo.
"""

import logging
import os
import uuid
from datetime import datetime

import cv2
import gradio as gr
import numpy as np
import torch
from huggingface_hub import upload_file
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)

MID = "apple/FastVLM-7B"
# Placeholder token id marking where the image features are spliced into the
# prompt (LLaVA-style convention handled by the model's remote code).
IMAGE_TOKEN_INDEX = -200

# Target repo for uploads; override with the HF_UPLOAD_REPO env var.
HF_MODEL = os.environ.get("HF_UPLOAD_REPO", "rahul7star/ImageExplain")

# Lazily-loaded tokenizer and model (see load_model()).
tok = None
model = None


def load_model():
    """Load the FastVLM tokenizer and model once and cache them globally."""
    global tok, model
    if tok is None or model is None:
        logging.info("Loading FastVLM model (CPU only)...")
        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            MID,
            torch_dtype=torch.float32,
            device_map="cpu",
            trust_remote_code=True,
        )
        logging.info("✅ Model loaded successfully on CPU")
    return tok, model

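# Note: loading a 7B-parameter model in float32 on CPU needs on the order of
# 28 GB of RAM (7B params x 4 bytes) plus activation overhead, and generation
# is slow. float32 is used here because float16 matmuls are often poorly
# supported on CPU. A lighter variant (an assumption, not part of the original
# app) would be bfloat16 where the CPU supports it:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       MID, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True
#   )
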
def extract_frames(video_path: str, num_frames: int = 8, sampling_method: str = "uniform"):
    """Extract up to `num_frames` frames from the video as PIL images."""
    logging.info(f"Extracting up to {num_frames} frames using '{sampling_method}' sampling")
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    logging.info(f"Total frames in video: {total_frames}")

    if total_frames == 0:
        cap.release()
        logging.warning("⚠️ No frames found in video")
        return []

    frames = []
    if sampling_method == "uniform":
        indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    elif sampling_method == "first":
        indices = list(range(min(num_frames, total_frames)))
    elif sampling_method == "last":
        start = max(0, total_frames - num_frames)
        indices = list(range(start, total_frames))
    else:  # any other value falls back to a middle window
        start = max(0, (total_frames - num_frames) // 2)
        indices = list(range(start, min(start + num_frames, total_frames)))

    logging.info(f"Selected frame indices: {list(indices)}")

    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, frame = cap.read()
        if ret:
            # OpenCV returns BGR; convert to RGB before handing off to PIL.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame_rgb))
            logging.info(f"✅ Extracted frame {idx}")
        else:
            logging.warning(f"⚠️ Failed to extract frame {idx}")

    cap.release()
    return frames

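# Worked example of the sampling modes for a 100-frame video and num_frames=4
# (indices follow directly from the branches above):
#   uniform -> [0, 33, 66, 99]
#   first   -> [0, 1, 2, 3]
#   last    -> [96, 97, 98, 99]
#   middle  -> [48, 49, 50, 51]   (the fallback branch)
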
def caption_frame(image: Image.Image, prompt: str) -> str:
    """Caption a single frame with FastVLM and return the cleaned text."""
    tok, model = load_model()
    logging.info(f"Captioning frame with prompt: {prompt!r}")

    # Render the chat template, then splice the image placeholder token
    # between the text that precedes and follows "<image>".
    messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
    rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    pre, post = rendered.split("<image>", 1)

    pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
    post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1)

    attention_mask = torch.ones_like(input_ids)
    px = model.get_vision_tower().image_processor(images=image, return_tensors="pt")["pixel_values"]

    with torch.no_grad():
        out = model.generate(
            inputs=input_ids,
            attention_mask=attention_mask,
            images=px,
            max_new_tokens=15,
            temperature=0.7,
            do_sample=True,
        )

    raw_output = tok.decode(out[0], skip_special_tokens=True)
    caption = raw_output
    # The decoded string echoes the prompt; keep only the text after it.
    if prompt in caption:
        caption = caption.split(prompt)[-1].strip()

    logging.info(f"✅ Final cleaned caption: {caption!r}")
    return caption

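# The sequence handed to generate() therefore looks like (a sketch of the
# LLaVA-style convention the remote code is assumed to follow):
#   [ ...tokens before <image>... , -200 , ...tokens after <image>... ]
# where the single -200 slot is replaced by the projected vision features
# computed from `px` inside the model.
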
def upload_to_hf(video_path, summary_text):
    """Upload the video and its caption summary to a dated folder in HF_MODEL."""
    today_str = datetime.now().strftime("%Y-%m-%d")
    date_folder = f"{today_str}-APPLE-Video_FOLDER"

    unique_subfolder = f"upload_{uuid.uuid4().hex[:8]}"
    hf_folder = f"{date_folder}/{unique_subfolder}"
    logging.info(f"Uploading files to HF folder: {hf_folder} in repo {HF_MODEL}")

    # Upload the original video.
    video_filename = os.path.basename(video_path)
    video_hf_path = f"{hf_folder}/{video_filename}"
    upload_file(
        path_or_fileobj=video_path,
        path_in_repo=video_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded video to HF: {video_hf_path}")

    # Write the summary to a temporary file and upload it alongside the video.
    summary_file = "/tmp/summary.txt"
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(summary_text)

    summary_hf_path = f"{hf_folder}/summary.txt"
    upload_file(
        path_or_fileobj=summary_file,
        path_in_repo=summary_hf_path,
        repo_id=HF_MODEL,
        repo_type="model",
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),
    )
    logging.info(f"✅ Uploaded summary to HF: {summary_hf_path}")

    return hf_folder

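# Resulting layout in the HF_MODEL repo (folder names below are illustrative;
# the date and the 8-hex suffix are generated at upload time):
#   2025-01-01-APPLE-Video_FOLDER/
#       upload_ab12cd34/
#           <original video filename>
#           summary.txt
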
def process_video(video_path, num_frames, sampling_method, chat_history, progress=gr.Progress()):
    """Extract frames, caption them one by one, and upload the results to HF."""
    if not video_path:
        chat_history.append(["Assistant", "Please upload a video first."])
        logging.warning("No video uploaded")
        return chat_history, None

    logging.info(f"Starting analysis of video: {video_path}")
    progress(0, desc="Extracting frames...")
    frames = extract_frames(video_path, num_frames, sampling_method)

    if not frames:
        chat_history.append(["Assistant", "Failed to extract frames."])
        logging.error("No frames extracted")
        return chat_history, None

    prompt = "Provide a brief one-sentence description of what's happening in this image."
    captions = []

    # Stream partial results into the last chat message as each frame finishes.
    chat_history.append(["Assistant", "Analyzing frames..."])
    for i, frame in enumerate(frames):
        caption = caption_frame(frame, prompt)
        captions.append(f"Frame {i + 1}: {caption}")
        chat_history[-1] = ["Assistant", "\n".join(captions)]
        progress((i + 1) / len(frames))
        logging.info(f"Progress: frame {i + 1}/{len(frames)} analyzed")

    final_summary = "\n".join(captions)
    logging.info("✅ Video analysis complete")
    logging.info(f"Final summary:\n{final_summary}")

    hf_folder = upload_to_hf(video_path, final_summary)
    chat_history.append(["Assistant", f"✅ Video and summary uploaded to HF folder: {hf_folder}"])

    progress(1.0, desc="Analysis complete!")
    return chat_history, frames

class AppleTheme(gr.themes.Base):
    """Minimal Apple-style theme: blue primary with gray accents."""

    def __init__(self):
        super().__init__(
            primary_hue=gr.themes.colors.blue,
            secondary_hue=gr.themes.colors.gray,
            neutral_hue=gr.themes.colors.gray,
        )

with gr.Blocks(theme=AppleTheme()) as demo:
    gr.Markdown("# 🎬 FastVLM Video Captioning (CPU Only, with HF Upload)")

    with gr.Row():
        with gr.Column(scale=7):
            video_display = gr.Video(label="Video Input", autoplay=True, loop=True)

    with gr.Sidebar(width=400):
        chatbot = gr.Chatbot(
            value=[["Assistant", "Upload a video and I'll analyze it for you!"]],
            height=400,
        )
        process_btn = gr.Button("🎯 Analyze Video", variant="primary")

        with gr.Accordion("🖼️ Analyzed Frames", open=False):
            frame_gallery = gr.Gallery(columns=2, rows=4, height="auto")

    # Fixed analysis settings (not exposed as UI controls).
    num_frames = gr.State(value=4)
    sampling_method = gr.State(value="uniform")

    process_btn.click(
        fn=process_video,
        inputs=[video_display, num_frames, sampling_method, chatbot],
        outputs=[chatbot, frame_gallery],
        show_progress=True,
    )

demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
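# How this script is typically run (a sketch; the filename "app.py" is an
# assumption, the port and env var names come from the code above):
#
#   export HUGGINGFACE_HUB_TOKEN=hf_...          # write access to the upload repo
#   export HF_UPLOAD_REPO=your-user/your-repo    # optional, overrides the default
#   python app.py                                # UI served at http://localhost:7860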