Luigi committed
Commit 288784f · 1 Parent(s): 859b909

initial commit

Files changed (3)
  1. README.md +28 -1
  2. app.py +106 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -11,4 +11,31 @@ license: apache-2.0
  short_description: Fall detector with TimeSformer
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Video Human Detection Demo using TimeSformer
+
+ This is a Hugging Face Spaces demo project that uses TimeSformer – a video transformer model – for video-based human detection (action recognition). The demo uses the pre-trained model `facebook/timesformer-base-finetuned-k400` from the Hugging Face Hub, which has been fine-tuned on the Kinetics-400 dataset and can classify a video into one of 400 human action categories.
+
+ ## Overview
+
+ - **Model:** We use a TimeSformer model (`facebook/timesformer-base-finetuned-k400`) to classify video clips.
+ - **Feature Extractor:** The demo employs the Hugging Face `AutoFeatureExtractor` to process and prepare the sampled video frames.
+ - **Inference:** The model outputs a set of predicted action labels with scores; these predictions help detect human actions in the video (see the sketch after this list).
+ - **Interface:** Built with Gradio, the demo lets the user upload a video file. The application extracts frames from the video, processes them with the model, and displays the top action predictions.
+
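+ As an illustration of that pipeline, the sketch below mirrors what `app.py` does with the feature extractor and the model. It is only a sketch: the black dummy frames exist solely to make it runnable end to end – in real use the frames would come from an actual clip (e.g. via `extract_frames` in `app.py`), and newer `transformers` releases may prefer `AutoImageProcessor` over `AutoFeatureExtractor`.
+
+ ```python
+ import numpy as np
+ import torch
+ from PIL import Image
+ from transformers import AutoFeatureExtractor, AutoModelForVideoClassification
+
+ ckpt = "facebook/timesformer-base-finetuned-k400"
+ extractor = AutoFeatureExtractor.from_pretrained(ckpt)
+ model = AutoModelForVideoClassification.from_pretrained(ckpt).eval()
+
+ # Dummy clip: 16 black 224x224 frames standing in for frames sampled from a real video.
+ frames = [Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8)) for _ in range(16)]
+
+ inputs = extractor(frames, return_tensors="pt")        # pixel_values: [1, 16, 3, 224, 224]
+ with torch.no_grad():
+     probs = model(**inputs).logits.softmax(dim=-1)[0]  # probabilities over the 400 action classes
+
+ top = probs.topk(5)
+ for score, idx in zip(top.values.tolist(), top.indices.tolist()):
+     print(f"{model.config.id2label[idx]}: {score:.3f}")  # label names come from the model config
+ ```
+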
+ ## Setup and Deployment
+
+ 1. **Requirements:** See `requirements.txt` for the list of required packages (install them as shown below).
+ 2. **Run Locally:** You can run the demo locally using:
+ ```bash
+ python app.py
+ ```
+ 3. **Deploy on Hugging Face Spaces:**
+    Simply push these files to a new repository under HF Spaces. The app runs on ZeroGPU when available and is fully compatible with CPU-only environments.
+
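+ For a local run (step 2 above), install the dependencies first. Note that `app.py` also imports the `spaces` package, which is preinstalled on Hugging Face Spaces but is not listed in `requirements.txt`; for a purely local, CPU-only setup see the import-guard sketch in the Notes below.
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ Once the environment is set up, the classifier can also be smoke-tested without the Gradio UI (again only a sketch – `sample.mp4` is a placeholder for a local clip):
+
+ ```python
+ from app import classify_video  # importing app builds the Gradio interface but does not launch it
+
+ print(classify_video("sample.mp4"))  # prints the top-5 predicted actions, one "label: score" per line
+ ```
+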
+ ## Notes
+
+ - **Video Preprocessing:** The demo extracts frames using OpenCV and passes them to the feature extractor. The number of frames and the resolution default to 16 frames at 224×224 and can be adjusted in `extract_frames`.
+ - **Model Performance:** TimeSformer is computationally heavy – for real-time use, consider a smaller or distilled variant, or reduce the number of frames processed.
+ - **ZeroGPU Support:** The app uses the `@spaces.GPU` decorator (from the HF Spaces ZeroGPU environment) when available; otherwise it runs on the CPU (see the sketch after this list).
+
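+ The `import spaces` at the top of `app.py` is unconditional, so one way to keep the script runnable on a machine without the ZeroGPU runtime is to guard that import with a no-op stand-in for the decorator. The snippet below is a sketch of that optional modification, not something `app.py` currently does:
+
+ ```python
+ # Optional guard for local, CPU-only runs: fall back to a no-op decorator
+ # when the ZeroGPU `spaces` package is not available.
+ try:
+     import spaces
+ except ImportError:
+     class spaces:  # minimal stand-in so `@spaces.GPU` keeps working
+         @staticmethod
+         def GPU(func=None, **kwargs):
+             if func is None:       # used as @spaces.GPU(...)
+                 return lambda f: f
+             return func            # used as bare @spaces.GPU
+ ```
+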
+ Enjoy testing human detection in videos with this demo!
app.py ADDED
@@ -0,0 +1,106 @@
+ import spaces  # Import spaces immediately for HF ZeroGPU support.
+ import os
+ import cv2
+ import torch
+ import yt_dlp  # (Retained in requirements for potential video fetching use)
+ import numpy as np
+ from PIL import Image
+ import gradio as gr
+
+ from transformers import AutoFeatureExtractor, AutoModelForVideoClassification
+
+ # Specify the model checkpoint for TimeSformer (fine-tuned on Kinetics-400).
+ MODEL_NAME = "facebook/timesformer-base-finetuned-k400"
+
+ def extract_frames(video_path, num_frames=16, target_size=(224, 224)):
+     """
+     Extract up to `num_frames` uniformly-sampled frames from the video.
+     If the video has fewer frames, all frames are returned.
+     """
+     cap = cv2.VideoCapture(video_path)
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     frames = []
+     if total_frames <= 0:
+         cap.release()
+         return frames
+     indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+     current_frame = 0
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+         if current_frame in indices:
+             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             frame = cv2.resize(frame, target_size)
+             frames.append(Image.fromarray(frame))
+         current_frame += 1
+     cap.release()
+     return frames
+
+ @spaces.GPU
+ def classify_video(video_path):
+     """
+     Loads the TimeSformer model and feature extractor inside the GPU context,
+     extracts frames from the video, runs inference, and returns the top 5 predicted actions.
+     """
+     # Load the feature extractor and model inside the GPU context.
+     feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
+     model = AutoModelForVideoClassification.from_pretrained(MODEL_NAME)
+     model.eval()
+
+     # Determine the device.
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     # Extract frames from the video (here we sample 16 frames).
+     frames = extract_frames(video_path, num_frames=16, target_size=(224, 224))
+     if len(frames) == 0:
+         return "No frames extracted from video."
+
+     # Preprocess the frames.
+     inputs = feature_extractor(frames, return_tensors="pt")
+     inputs = {key: val.to(device) for key, val in inputs.items()}
+
+     # Run inference.
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     # Compute probabilities from logits.
+     logits = outputs.logits  # Shape: [batch_size, num_classes]; batch_size is 1 here.
+     probs = torch.nn.functional.softmax(logits, dim=-1)[0]
+
+     # Get the top 5 predictions.
+     top_probs, top_indices = torch.topk(probs, k=5)
+     top_probs = top_probs.cpu().numpy()
+     top_indices = top_indices.cpu().numpy()
+
+     # Retrieve the label mapping from the model config (class index -> action name).
+     id2label = model.config.id2label
+     results = []
+     for idx, prob in zip(top_indices, top_probs):
+         label = id2label.get(int(idx), f"Class {idx}")
+         results.append(f"{label}: {prob:.3f}")
+
+     return "\n".join(results)
+
+ def process_video(video_file):
+     if video_file is None:
+         return "No video provided."
+     result = classify_video(video_file)
+     return result
+
+ # Gradio interface definition.
+ demo = gr.Interface(
+     fn=process_video,
+     inputs=gr.Video(sources=["upload"], label="Upload Video Clip"),
+     outputs=gr.Textbox(label="Predicted Actions"),
+     title="Video Human Detection Demo using TimeSformer",
+     description=(
+         "Upload a video clip to see the top predicted human action labels using the TimeSformer model "
+         "(fine-tuned on Kinetics-400). This demo loads the model and feature extractor within the GPU context "
+         "for optimized inference in Hugging Face ZeroGPU Spaces while also supporting CPU-only environments."
+     ),
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio>=4.0
+ torch>=2.0.1
+ transformers>=4.25.0
+ opencv-python>=4.5.5
+ Pillow>=8.4.0
+ yt-dlp>=2022.12.1