Spaces:

ekabaruh
/

real-time-people-detection

Sleeping

App Files Files Community

ekabaruh commited on May 1

Commit

9867c2c

verified ·

1 Parent(s): f38a2ce

Upload 8 files

Browse files

Files changed (9) hide show

.gitattributes +3 -0
README.md +53 -6
app.py +713 -0
assets/one-by-one-person-detection.mp4 +3 -0
assets/people-detection.mp4 +3 -0
assets/store-aisle-detection.mp4 +3 -0
packages.txt +2 -0
requirements.txt +7 -0
yolov8n.pt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/one-by-one-person-detection.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/people-detection.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/store-aisle-detection.mp4 filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,10 +1,57 @@
 ---
-title: Real Time People Detection
-emoji: 📊
-colorFrom: gray
-colorTo: gray
-sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Real-time People Detection
+emoji: 👁️
+colorFrom: blue
+colorTo: green
+sdk: streamlit
+sdk_version: 1.28.0
+app_file: app.py
 pinned: false
 ---
+# Real-time People Detection
+This Streamlit application demonstrates real-time people detection using YOLOv8, optimized for Hugging Face Spaces.
+## Features
+- Real-time people detection in video streams
+- Uses YOLOv8n - the smallest and fastest YOLOv8 model
+- Interactive controls for detection threshold
+- Adjustable inference rate for performance optimization
+- Performance metrics display (FPS, inference time)
+## How to Use
+1. Select your model (YOLOv8n is default for best performance)
+2. Adjust the detection threshold (higher values = fewer but more confident detections)
+3. Set your target inference FPS (lower values use less resources)
+4. Select a demo video
+5. Click "Start" to begin detection
+6. Use "Stop" to halt the detection process
+## Demo Videos
+The application includes several demo videos:
+- One Person - Simple video with a single person
+- Store Aisle - People walking in a store
+- People Detection - Various people in different settings
+## Performance Notes
+- The YOLOv8n model provides the best balance of speed and accuracy for this application
+- Reducing the target inference FPS will improve performance while maintaining a smooth display
+- The actual inference rate may be lower than the target on resource-constrained environments
+- Detection results are reused between frames when running below the video framerate
+## Deployment on Hugging Face Spaces
+This application is designed to be deployed on Hugging Face Spaces. The deployment automatically uses:
+- Streamlit for the web interface
+- YOLOv8n model for efficient people detection
+- Demo videos for testing (webcam access is disabled in Spaces)
+## License
+MIT

app.py ADDED Viewed

	@@ -0,0 +1,713 @@

+"""
+Real-time People Detection Streamlit application.
+This is the main entry point for the Hugging Face Space application.
+"""
+import os
+import time
+from pathlib import Path
+from typing import Tuple, Dict, Any, Optional, List
+import cv2
+import numpy as np
+import streamlit as st
+from PIL import Image
+import torch
+from ultralytics import YOLO
+# Constants
+ASSETS_DIR = Path(__file__).parent / "assets"
+DEMO_VIDEOS = {
+    "One Person": ASSETS_DIR / "one-by-one-person-detection.mp4",
+    "Store Aisle": ASSETS_DIR / "store-aisle-detection.mp4",
+    "People Detection": ASSETS_DIR / "people-detection.mp4"
+}
+FRAME_WIDTH = 640
+FRAME_HEIGHT = 480
+class PeopleDetector:
+    """
+    A class for detecting people in images using a pre-trained YOLOv8n model.
+    Attributes:
+        model_name: Name or path of the YOLOv8 model to use
+        threshold: Confidence threshold for detection
+        device: Device to run inference on (cuda/cpu)
+        model: The detection model
+    """
+    def __init__(
+        self,
+        model_name: str = "yolov8n.pt",
+        threshold: float = 0.5,
+        device: Optional[str] = None,
+    ):
+        """
+        Initialize the people detector with a pre-trained model.
+        Args:
+            model_name: YOLOv8 model name to use ('yolov8n.pt' is the smallest one)
+            threshold: Confidence threshold for detection (0.0 to 1.0)
+            device: Device to run inference on (cuda/cpu). If None, will use cuda if available.
+        """
+        self.model_name = model_name
+        self.threshold = threshold
+        # Determine the device to use
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+        # Load the YOLOv8 model
+        self.model = YOLO(model_name)
+        # Person class ID is 0 in COCO (YOLOv8 uses COCO classes)
+        self.person_class_id = 0
+    def detect(self, image: np.ndarray) -> Tuple[List[Dict[str, Any]], float]:
+        """
+        Detect people in an image.
+        Args:
+            image: Input image as numpy array (BGR format from OpenCV)
+        Returns:
+            Tuple containing:
+                - List of detection results with keys 'box', 'score', and 'label'
+                - Inference time in seconds
+        """
+        # Start timing
+        start_time = time.time()
+        # Run inference with YOLOv8
+        results = self.model(image, conf=self.threshold, device=self.device)
+        # Extract detections of people only
+        detections = []
+        # Process the results
+        for result in results:
+            boxes = result.boxes
+            # Extract coordinates, confidence and class
+            for i, box in enumerate(boxes):
+                cls = int(box.cls.item())
+                conf = float(box.conf.item())
+                # Check if it's a person (class 0)
+                if cls == self.person_class_id:
+                    # Get bounding box
+                    x1, y1, x2, y2 = map(int, box.xyxy.tolist()[0])
+                    detections.append({
+                        'box': (x1, y1, x2, y2),
+                        'score': conf,
+                        'label': 'person'
+                    })
+        # Calculate inference time
+        inference_time = time.time() - start_time
+        return detections, inference_time
+    def update_threshold(self, threshold: float) -> None:
+        """
+        Update the detection confidence threshold.
+        Args:
+            threshold: New threshold value (0.0 to 1.0)
+        """
+        self.threshold = threshold
+class VideoSource:
+    """
+    A class for handling video input from different sources (webcam or file).
+    Attributes:
+        source: Camera index (int) or video file path (str)
+        width: Frame width to set (if possible)
+        height: Frame height to set (if possible)
+        fps_buffer_size: Number of frames to average for FPS calculation
+    """
+    def __init__(
+        self,
+        source: Any = 0,
+        width: int = 640,
+        height: int = 480,
+        fps_buffer_size: int = 30,
+    ):
+        """
+        Initialize the video source.
+        Args:
+            source: Camera index (int) or video file path (str)
+            width: Width to set for the captured frames
+            height: Height to set for the captured frames
+            fps_buffer_size: Number of frames to use for FPS averaging
+        """
+        self.source = source
+        self.width = width
+        self.height = height
+        self.fps_buffer_size = fps_buffer_size
+        self.cap = None
+        self.frame_times = []
+        self.is_running = False
+    def start(self) -> bool:
+        """
+        Start the video capture.
+        Returns:
+            bool: True if capture was started successfully, False otherwise
+        """
+        if self.is_running:
+            return True
+        self.cap = cv2.VideoCapture(self.source)
+        if not self.cap.isOpened():
+            return False
+        # Try to set properties if it's a webcam
+        if isinstance(self.source, int):
+            self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, self.width)
+            self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self.height)
+        self.is_running = True
+        self.frame_times = []
+        return True
+    def stop(self) -> None:
+        """Stop the video capture and release resources."""
+        if self.is_running and self.cap is not None:
+            self.cap.release()
+            self.is_running = False
+    def read_frame(self) -> Tuple[bool, Optional[np.ndarray]]:
+        """
+        Read a single frame from the video source.
+        Returns:
+            Tuple containing:
+                - Boolean indicating if frame was successfully read
+                - Image as numpy array (or None if no frame was read)
+        """
+        if not self.is_running or self.cap is None:
+            return False, None
+        # Record time for FPS calculation
+        current_time = time.time()
+        # Read frame
+        ret, frame = self.cap.read()
+        if ret:
+            # Update FPS buffer
+            self.frame_times.append(current_time)
+            if len(self.frame_times) > self.fps_buffer_size:
+                self.frame_times.pop(0)
+        return ret, frame
+    def get_fps(self) -> float:
+        """
+        Calculate the current FPS based on actual frame timings.
+        Returns:
+            float: Current frames per second
+        """
+        if len(self.frame_times) < 2:
+            return 0.0
+        # Calculate FPS from time differences
+        time_diff = self.frame_times[-1] - self.frame_times[0]
+        if time_diff > 0:
+            return (len(self.frame_times) - 1) / time_diff
+        return 0.0
+def draw_detections(
+    image: np.ndarray,
+    detections: List[Dict[str, Any]],
+    color: Tuple[int, int, int] = (0, 255, 0),
+    thickness: int = 2,
+    font_scale: float = 0.5,
+) -> np.ndarray:
+    """
+    Draw bounding boxes and labels for detected people.
+    Args:
+        image: Input image to draw on
+        detections: List of detection results from PeopleDetector
+        color: BGR color tuple for bounding boxes
+        thickness: Line thickness for bounding boxes
+        font_scale: Font scale for text labels
+    Returns:
+        np.ndarray: Image with drawn detections
+    """
+    annotated_image = image.copy()
+    for detection in detections:
+        # Extract bounding box coordinates
+        x_min, y_min, x_max, y_max = detection['box']
+        # Draw bounding box
+        cv2.rectangle(
+            annotated_image,
+            (x_min, y_min),
+            (x_max, y_max),
+            color,
+            thickness
+        )
+        # Create label with confidence score
+        label = f"Person: {detection['score']:.2f}"
+        # Calculate text size and position
+        (text_width, text_height), _ = cv2.getTextSize(
+            label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+        )
+        # Draw label background
+        cv2.rectangle(
+            annotated_image,
+            (x_min, y_min - text_height - 5),
+            (x_min + text_width, y_min),
+            color,
+            -1  # Filled rectangle
+        )
+        # Draw text
+        cv2.putText(
+            annotated_image,
+            label,
+            (x_min, y_min - 5),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            font_scale,
+            (0, 0, 0),  # Black text
+            thickness
+        )
+    return annotated_image
+def add_performance_stats(
+    image: np.ndarray,
+    fps: float,
+    inference_time: float,
+    people_count: int,
+    inference_fps: float = 0.0,
+    bg_color: Tuple[int, int, int] = (0, 0, 0),
+    text_color: Tuple[int, int, int] = (255, 255, 255),
+    font_scale: float = 0.5,
+    thickness: int = 1,
+) -> np.ndarray:
+    """
+    Add performance statistics to the image.
+    Args:
+        image: Input image to add stats to
+        fps: Current FPS value
+        inference_time: Model inference time in seconds
+        people_count: Number of people detected
+        inference_fps: Inference FPS (model predictions per second)
+        bg_color: Background color for stats box
+        text_color: Text color for stats
+        font_scale: Font scale for text
+        thickness: Line thickness for text
+    Returns:
+        np.ndarray: Image with added performance stats
+    """
+    stats_image = image.copy()
+    # Create stats text
+    fps_text = f"FPS: {fps:.1f}"
+    inference_text = f"Inference: {inference_time*1000:.1f}ms"
+    count_text = f"People: {people_count}"
+    inf_fps_text = f"Inference FPS: {inference_fps:.1f}"
+    # Get text sizes
+    (fps_width, fps_height), _ = cv2.getTextSize(
+        fps_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+    )
+    (inf_width, inf_height), _ = cv2.getTextSize(
+        inference_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+    )
+    (count_width, count_height), _ = cv2.getTextSize(
+        count_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+    )
+    (inf_fps_width, inf_fps_height), _ = cv2.getTextSize(
+        inf_fps_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+    )
+    # Calculate background box dimensions
+    box_width = max(fps_width, inf_width, count_width, inf_fps_width) + 20
+    box_height = fps_height + inf_height + count_height + inf_fps_height + 30
+    # Draw background box
+    cv2.rectangle(
+        stats_image,
+        (10, 10),
+        (10 + box_width, 10 + box_height),
+        bg_color,
+        -1  # Filled rectangle
+    )
+    # Draw text
+    y_offset = 10 + fps_height + 5
+    cv2.putText(
+        stats_image,
+        fps_text,
+        (20, y_offset),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_scale,
+        text_color,
+        thickness
+    )
+    y_offset += inf_height + 5
+    cv2.putText(
+        stats_image,
+        inference_text,
+        (20, y_offset),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_scale,
+        text_color,
+        thickness
+    )
+    y_offset += count_height + 5
+    cv2.putText(
+        stats_image,
+        count_text,
+        (20, y_offset),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_scale,
+        text_color,
+        thickness
+    )
+    y_offset += inf_fps_height + 5
+    cv2.putText(
+        stats_image,
+        inf_fps_text,
+        (20, y_offset),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_scale,
+        text_color,
+        thickness
+    )
+    return stats_image
+class PeopleDetectionApp:
+    """
+    Streamlit application for real-time people detection.
+    This class handles the Streamlit UI components and orchestrates
+    the video capture and detection processes.
+    """
+    def __init__(self):
+        """Initialize the Streamlit application components."""
+        # Set page config
+        st.set_page_config(
+            page_title="Real-time People Detection",
+            page_icon="👁️",
+            layout="wide",
+        )
+        # Initialize session state
+        if "video_source" not in st.session_state:
+            st.session_state.video_source = None
+        if "detector" not in st.session_state:
+            st.session_state.detector = None
+        if "is_running" not in st.session_state:
+            st.session_state.is_running = False
+        if "frame_placeholder" not in st.session_state:
+            st.session_state.frame_placeholder = None
+        if "last_inference_time" not in st.session_state:
+            st.session_state.last_inference_time = 0.0
+        if "last_inference_timestamp" not in st.session_state:
+            st.session_state.last_inference_timestamp = 0.0
+        if "frame_count" not in st.session_state:
+            st.session_state.frame_count = 0
+        if "last_frame" not in st.session_state:
+            st.session_state.last_frame = None
+        if "last_detections" not in st.session_state:
+            st.session_state.last_detections = []
+    def create_ui(self):
+        """Create the Streamlit UI components."""
+        # Page header
+        st.title("Real-time People Detection")
+        st.markdown(
+            "This application detects people in video streams using YOLOv8."
+        )
+        # Sidebar for controls
+        with st.sidebar:
+            st.header("Settings")
+            # Model selection
+            model_name = st.selectbox(
+                "Select detection model",
+                options=[
+                    "yolov8n.pt",  # Nano model (smallest)
+                ],
+                index=0,
+            )
+            # Detection threshold
+            detection_threshold = st.slider(
+                "Detection threshold",
+                min_value=0.1,
+                max_value=1.0,
+                value=0.5,
+                step=0.05,
+            )
+            # Target inference FPS
+            target_fps = st.slider(
+                "Target inference FPS",
+                min_value=1,
+                max_value=30,
+                value=10,
+                step=1,
+                help="Control how many frames per second are sent to the model for inference. Lower values use less resources but may appear less smooth."
+            )
+            # For Hugging Face Space, we only provide demo videos (no webcam)
+            source_type = "Demo Video"
+            # Let user select which demo video to use
+            demo_selection = st.selectbox(
+                "Select demo video",
+                options=list(DEMO_VIDEOS.keys()),
+                index=0,
+            )
+            video_path = str(DEMO_VIDEOS[demo_selection])
+            source = video_path
+            # Control buttons
+            col1, col2 = st.columns(2)
+            with col1:
+                start_button = st.button(
+                    "Start" if not st.session_state.is_running else "Restart",
+                    use_container_width=True,
+                )
+            with col2:
+                stop_button = st.button(
+                    "Stop",
+                    use_container_width=True,
+                    disabled=not st.session_state.is_running,
+                )
+        # Main area for video display
+        video_column, stats_column = st.columns([3, 1])
+        with video_column:
+            st.subheader("Detection Feed")
+            # Create a placeholder for the video frame
+            frame_placeholder = st.empty()
+            st.session_state.frame_placeholder = frame_placeholder
+        with stats_column:
+            st.subheader("Performance Stats")
+            # Create placeholders for stats
+            fps_text = st.empty()
+            inference_text = st.empty()
+            people_count = st.empty()
+            inference_fps_text = st.empty()
+        # Handle button actions
+        if start_button:
+            self.start_detection(source, model_name, detection_threshold, target_fps)
+        if stop_button:
+            self.stop_detection()
+        # Return stats placeholders for updating
+        return fps_text, inference_text, people_count, inference_fps_text
+    def start_detection(self, source, model_name, threshold, target_fps):
+        """
+        Start the detection process.
+        Args:
+            source: Video source (camera ID or file path)
+            model_name: YOLOv8 model to use
+            threshold: Detection confidence threshold
+            target_fps: Target frames per second for inference
+        """
+        # Stop existing detection if running
+        self.stop_detection()
+        # Initialize video source
+        video_source = VideoSource(
+            source=source,
+            width=FRAME_WIDTH,
+            height=FRAME_HEIGHT,
+        )
+        # Initialize detector
+        detector = PeopleDetector(
+            model_name=model_name,
+            threshold=threshold,
+        )
+        # Start video capture
+        if not video_source.start():
+            st.error(f"Failed to open video source: {source}")
+            return
+        # Store objects in session state
+        st.session_state.video_source = video_source
+        st.session_state.detector = detector
+        st.session_state.is_running = True
+        st.session_state.target_fps = target_fps
+        st.session_state.last_inference_timestamp = time.time()
+        st.session_state.frame_count = 0
+        st.session_state.last_frame = None
+        st.session_state.last_detections = []
+    def stop_detection(self):
+        """Stop the detection process and release resources."""
+        if st.session_state.video_source is not None:
+            st.session_state.video_source.stop()
+            st.session_state.video_source = None
+        st.session_state.detector = None
+        st.session_state.is_running = False
+        st.session_state.last_frame = None
+        st.session_state.last_detections = []
+    def update_frame(self, fps_text, inference_text, people_count, inference_fps_text):
+        """
+        Update the video frame and stats.
+        Args:
+            fps_text: Streamlit element for FPS display
+            inference_text: Streamlit element for inference time display
+            people_count: Streamlit element for people count display
+            inference_fps_text: Streamlit element for inference FPS display
+        """
+        if not st.session_state.is_running:
+            return
+        video_source = st.session_state.video_source
+        detector = st.session_state.detector
+        target_fps = st.session_state.target_fps
+        if video_source is None or detector is None:
+            return
+        # Read a new frame
+        ret, frame = video_source.read_frame()
+        if not ret:
+            # If we've reached the end of a video file, restart it
+            if not isinstance(video_source.source, int):
+                # Restart video
+                video_source.stop()
+                if video_source.start():
+                    ret, frame = video_source.read_frame()
+                    if not ret:
+                        st.error("Failed to restart video")
+                        self.stop_detection()
+                        return
+                else:
+                    st.error("Failed to restart video source")
+                    self.stop_detection()
+                    return
+            else:
+                st.error("Failed to read frame from camera")
+                self.stop_detection()
+                return
+        # Calculate current FPS
+        fps = video_source.get_fps()
+        # Determine if we should run inference on this frame
+        current_time = time.time()
+        time_since_last_inference = current_time - st.session_state.last_inference_timestamp
+        inference_interval = 1.0 / target_fps
+        # Use cached detections or run new detection
+        detections = []
+        inference_time = 0
+        # Run a new detection if enough time has passed
+        if time_since_last_inference >= inference_interval:
+            detections, inference_time = detector.detect(frame)
+            # Update cache
+            st.session_state.last_frame = frame.copy()
+            st.session_state.last_detections = detections
+            st.session_state.last_inference_time = inference_time
+            st.session_state.last_inference_timestamp = current_time
+        else:
+            # Use cached detections
+            detections = st.session_state.last_detections
+            inference_time = st.session_state.last_inference_time
+        # Draw detections on the frame
+        frame_with_detections = draw_detections(frame, detections)
+        # Calculate inference FPS
+        if time_since_last_inference > 0:
+            inference_fps = 1.0 / time_since_last_inference
+        else:
+            inference_fps = 0.0
+        # Add performance stats to the frame
+        frame_with_stats = add_performance_stats(
+            frame_with_detections,
+            fps,
+            inference_time,
+            len(detections),
+            inference_fps
+        )
+        # Display the frame
+        st.session_state.frame_placeholder.image(
+            frame_with_stats,
+            channels="BGR",
+            use_column_width=True
+        )
+        # Update stats
+        fps_text.metric("FPS", f"{fps:.1f}")
+        inference_text.metric("Inference Time", f"{inference_time*1000:.1f} ms")
+        people_count.metric("People Detected", len(detections))
+        inference_fps_text.metric("Inference FPS", f"{inference_fps:.1f}")
+        # Increment frame counter
+        st.session_state.frame_count += 1
+def main():
+    """Main entry point for the application."""
+    app = PeopleDetectionApp()
+    fps_text, inference_text, people_count, inference_fps_text = app.create_ui()
+    # Infinite loop for updating the video frame
+    while st.session_state.is_running:
+        app.update_frame(fps_text, inference_text, people_count, inference_fps_text)
+        time.sleep(0.01)  # Small delay to prevent overloading the CPU
+if __name__ == "__main__":
+    main()

assets/one-by-one-person-detection.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5964aa259099a482a8b360ffc2c57b5a30f84d5919236a4dad01f8e929ac07c
+size 3291918

assets/people-detection.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:18ffe8672d741e3e29c9d891d22c59d453720b086c25b35c88b393d55f92f693
+size 5482579

assets/store-aisle-detection.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3526fee39cac70d6366471e4324a6d63b337c8f8ab99c015521bee0d60ed6e04
+size 9214573

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ libgl1-mesa-glx
2	+ libglib2.0-0

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+opencv-python-headless>=4.8.0
+numpy>=1.24.0
+streamlit>=1.25.0
+torch>=2.0.0
+torchvision>=0.15.0
+pillow>=10.0.0
+ultralytics>=8.0.0

yolov8n.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f59b3d833e2ff32e194b5bb8e08d211dc7c5bdf144b90d2c8412c47ccfc83b36
+size 6549796