initial commit
- README.md +28 -1
- app.py +106 -0
- requirements.txt +6 -0
README.md CHANGED
@@ -11,4 +11,31 @@ license: apache-2.0
short_description: Fall detector with TimeSformer
---

# Video Human Detection Demo using TimeSformer

This is a Hugging Face Spaces demo project that uses TimeSformer, a video transformer model, for video-based human detection (action recognition). The demo uses the pre-trained checkpoint `facebook/timesformer-base-finetuned-k400` from Hugging Face, which is fine-tuned on the Kinetics-400 dataset and classifies a video clip into one of 400 human action categories.
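
The full label set ships with the checkpoint's configuration. As a minimal sketch (assuming the `facebook/timesformer-base-finetuned-k400` checkpoint can be downloaded), you can inspect the 400 class names without loading any model weights:

```python
from transformers import AutoConfig

# Load only the config to inspect the Kinetics-400 label set (no model weights needed).
config = AutoConfig.from_pretrained("facebook/timesformer-base-finetuned-k400")
print(len(config.id2label))                # 400 action classes
print(list(config.id2label.values())[:5])  # a few of the class names
```
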
## Overview

- **Model:** We use a TimeSformer model (`facebook/timesformer-base-finetuned-k400`) to classify video clips.
- **Image Processor:** The demo uses the Hugging Face `AutoImageProcessor` to resize and normalize the sampled video frames for the model.
- **Inference:** The model outputs predicted action labels with confidence scores; the top predictions indicate the human actions detected in the video (see the sketch below).
- **Interface:** Built with Gradio, the demo lets the user upload a video file. The application extracts frames from the video, runs them through the model, and displays the top action predictions.
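
The core inference path is the one `app.py` implements below. The following is a minimal, self-contained sketch of that flow; the random frames are only a stand-in for frames sampled from a real clip:

```python
import numpy as np
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForVideoClassification

checkpoint = "facebook/timesformer-base-finetuned-k400"
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForVideoClassification.from_pretrained(checkpoint)

# Dummy clip: 8 random RGB frames stand in for frames sampled from a real video.
frames = [Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))
          for _ in range(8)]

inputs = processor(frames, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

probs = logits.softmax(dim=-1)[0]
top = probs.topk(5)
for score, idx in zip(top.values, top.indices):
    print(f"{model.config.id2label[int(idx)]}: {score.item():.3f}")
```
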
## Setup and Deployment

1. **Requirements:** See `requirements.txt` for the list of required packages.
2. **Run Locally:** You can run the demo locally with:

   ```bash
   python app.py
   ```

3. **Deploy on Hugging Face Spaces:** Push these files to a new repository under HF Spaces. The app is designed to run with ZeroGPU when available and is fully compatible with CPU-only environments.

## Notes

- **Video Preprocessing:** The demo extracts frames with OpenCV and passes them to the image processor. The number of frames and the target resolution default to values that can be adjusted (see the example after this list).
- **Model Performance:** TimeSformer is computationally heavy. For near real-time use, consider a smaller or distilled variant, or reduce the number of frames processed.
- **ZeroGPU Support:** The app uses the `@spaces.GPU` decorator (from the HF Spaces ZeroGPU environment) when available; otherwise it runs on CPU.
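
As a quick way to experiment with the sampling settings, `extract_frames` from `app.py` can be called directly; the clip path and values below are only illustrative:

```python
from app import extract_frames

# Fewer frames reduce inference cost; a smaller target size mainly speeds up
# decoding/resizing (the image processor resizes frames for the model anyway).
# "example_clip.mp4" is a placeholder path.
frames = extract_frames("example_clip.mp4", num_frames=4, target_size=(160, 160))
print(f"Extracted {len(frames)} frames")
```
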
Enjoy testing human detection in videos with this demo!

app.py ADDED
@@ -0,0 +1,106 @@
# Import spaces immediately for HF ZeroGPU support.
try:
    import spaces
except ImportError:
    # Not running on HF Spaces: fall back to a no-op @spaces.GPU decorator so the app runs on CPU.
    import types

    def _noop_gpu(func=None, **kwargs):
        return func if func is not None else (lambda f: f)

    spaces = types.SimpleNamespace(GPU=_noop_gpu)

import os
import cv2
import torch
import yt_dlp  # (Retained in requirements for potential video fetching use.)
import numpy as np
from PIL import Image
import gradio as gr

from transformers import AutoImageProcessor, AutoModelForVideoClassification

# Specify the model checkpoint for TimeSformer (fine-tuned on Kinetics-400).
MODEL_NAME = "facebook/timesformer-base-finetuned-k400"


def extract_frames(video_path, num_frames=8, target_size=(224, 224)):
    """
    Extract up to `num_frames` uniformly sampled frames from the video.
    If the video has fewer frames, all frames are returned.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []
    if total_frames <= 0:
        cap.release()
        return frames
    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    current_frame = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if current_frame in indices:
            # Convert BGR (OpenCV) to RGB and resize for the model.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, target_size)
            frames.append(Image.fromarray(frame))
        current_frame += 1
    cap.release()
    return frames


@spaces.GPU
def classify_video(video_path):
    """
    Loads the TimeSformer model and image processor inside the GPU context,
    extracts frames from the video, runs inference, and returns the top 5 predicted actions.
    """
    # Load the image processor and model inside the GPU context.
    processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
    model = AutoModelForVideoClassification.from_pretrained(MODEL_NAME)
    model.eval()

    # Determine the device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Extract frames from the video (the K400 checkpoint was trained on 8 frames).
    frames = extract_frames(video_path, num_frames=8, target_size=(224, 224))
    if len(frames) == 0:
        return "No frames extracted from video."

    # Preprocess the frames; the list of PIL images is treated as a single video.
    inputs = processor(frames, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Run inference.
    with torch.no_grad():
        outputs = model(**inputs)

    # Compute probabilities from logits.
    logits = outputs.logits  # Shape: [batch_size, num_classes]; batch_size is 1 here.
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]

    # Get the top 5 predictions.
    top_probs, top_indices = torch.topk(probs, k=5)
    top_probs = top_probs.cpu().numpy()
    top_indices = top_indices.cpu().numpy()

    # Retrieve the label mapping from the model config.
    id2label = model.config.id2label
    results = []
    for idx, prob in zip(top_indices, top_probs):
        label = id2label.get(int(idx), f"Class {idx}")
        results.append(f"{label}: {prob:.3f}")

    return "\n".join(results)


def process_video(video_file):
    if video_file is None:
        return "No video provided."
    result = classify_video(video_file)
    return result


# Gradio interface definition.
demo = gr.Interface(
    fn=process_video,
    inputs=gr.Video(label="Upload Video Clip"),
    outputs=gr.Textbox(label="Predicted Actions"),
    title="Video Human Detection Demo using TimeSformer",
    description=(
        "Upload a video clip to see the top predicted human action labels using the TimeSformer model "
        "(fine-tuned on Kinetics-400). This demo loads the model and image processor within the GPU context "
        "for optimized inference in Hugging Face ZeroGPU Spaces while also supporting CPU-only environments."
    ),
)

if __name__ == "__main__":
    demo.launch()

requirements.txt ADDED
@@ -0,0 +1,6 @@
gradio>=4.0
torch>=2.0.1
transformers>=4.26.0
opencv-python>=4.5.5
Pillow>=8.4.0
yt-dlp>=2022.12.1