Spaces:

ekabaruh
/

real-time-people-detection

Sleeping

App Files Files Community

ekabaruh committed on May 1

Commit

3643479

verified ·

1 Parent(s): 1d5e702

Update app.py

Browse files

Files changed (1) hide show

app.py +468 -203

app.py CHANGED Viewed

@@ -1,44 +1,55 @@
 """
-Real-time People Detection App for Hugging Face Space
-This application detects people in images and videos using YOLOv8 from Ultralytics.
-The app provides an interface for uploading images or using webcam for real-time detection.
 """
 import os
 import time
 import cv2
 import numpy as np
-import gradio as gr
-import torch
 from PIL import Image
-from pathlib import Path
-from typing import Dict, List, Tuple, Any, Optional, Union
 from ultralytics import YOLO
 # Constants
-MODEL_PATH = "yolov8n.pt"
-DEMO_VIDEOS_DIR = "demo_videos"
 FRAME_WIDTH = 640
 FRAME_HEIGHT = 480
-DEFAULT_THRESHOLD = 0.5
 class PeopleDetector:
     """
-    A class for detecting people in images using a pre-trained YOLOv8 model.
     """
     def __init__(
         self,
-        model_name: str = MODEL_PATH,
-        threshold: float = DEFAULT_THRESHOLD,
         device: Optional[str] = None,
     ):
         """
         Initialize the people detector with a pre-trained model.
         Args:
-            model_name: YOLOv8 model name to use
             threshold: Confidence threshold for detection (0.0 to 1.0)
             device: Device to run inference on (cuda/cpu). If None, will use cuda if available.
         """
@@ -62,7 +73,7 @@ class PeopleDetector:
         Detect people in an image.
         Args:
-            image: Input image as numpy array
         Returns:
             Tuple containing:
@@ -102,6 +113,125 @@ class PeopleDetector:
         inference_time = time.time() - start_time
         return detections, inference_time
 def draw_detections(
     image: np.ndarray,
@@ -168,10 +298,13 @@ def draw_detections(
     return annotated_image
 def add_performance_stats(
     image: np.ndarray,
     inference_time: float,
     people_count: int,
     bg_color: Tuple[int, int, int] = (0, 0, 0),
     text_color: Tuple[int, int, int] = (255, 255, 255),
     font_scale: float = 0.5,
@@ -182,8 +315,10 @@ def add_performance_stats(
     Args:
         image: Input image to add stats to
         inference_time: Model inference time in seconds
         people_count: Number of people detected
         bg_color: Background color for stats box
         text_color: Text color for stats
         font_scale: Font scale for text
@@ -195,20 +330,28 @@ def add_performance_stats(
     stats_image = image.copy()
     # Create stats text
-    people_text = f"People: {people_count}"
     inference_text = f"Inference: {inference_time*1000:.1f}ms"
     # Get text sizes
-    (people_width, people_height), _ = cv2.getTextSize(
-        people_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
     )
     (inf_width, inf_height), _ = cv2.getTextSize(
         inference_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
     )
     # Calculate background box dimensions
-    box_width = max(people_width, inf_width) + 20
-    box_height = people_height + inf_height + 20
     # Draw background box
     cv2.rectangle(
@@ -220,20 +363,44 @@ def add_performance_stats(
     )
     # Draw text
     cv2.putText(
         stats_image,
-        people_text,
-        (20, 10 + people_height + 5),
         cv2.FONT_HERSHEY_SIMPLEX,
         font_scale,
         text_color,
         thickness
     )
     cv2.putText(
         stats_image,
         inference_text,
-        (20, 10 + people_height + inf_height + 10),
         cv2.FONT_HERSHEY_SIMPLEX,
         font_scale,
         text_color,
@@ -242,207 +409,305 @@ def add_performance_stats(
     return stats_image
-# Initialize the detector
-detector = PeopleDetector(model_name=MODEL_PATH, threshold=DEFAULT_THRESHOLD)
-def process_image(image, threshold):
     """
-    Process an image with people detection.
-    Args:
-        image: Input image
-        threshold: Detection confidence threshold
-    Returns:
-        Annotated image with detections
     """
-    if image is None:
-        return None
-    # Update threshold if needed
-    if detector.threshold != threshold:
-        detector.threshold = threshold
-    # Convert to numpy array if needed
-    if isinstance(image, Image.Image):
-        image_array = np.array(image)
-        # Convert RGB to BGR (OpenCV format)
-        if len(image_array.shape) == 3 and image_array.shape[2] == 3:
-            image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
-    else:
-        image_array = image
-    # Run detection
-    detections, inference_time = detector.detect(image_array)
-    # Draw detections
-    annotated_image = draw_detections(image_array, detections)
-    # Add performance stats
-    annotated_image = add_performance_stats(
-        annotated_image,
-        inference_time,
-        len(detections)
-    )
-    # Convert back to RGB for display
-    if len(annotated_image.shape) == 3 and annotated_image.shape[2] == 3:
-        annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
-    return annotated_image
-def process_video(video_path, threshold):
-    """
-    Process a video with people detection.
-    Args:
-        video_path: Path to input video
-        threshold: Detection confidence threshold
-    Returns:
-        Path to output video with detections
-    """
-    if video_path is None:
-        return None
-    # Update threshold if needed
-    if detector.threshold != threshold:
-        detector.threshold = threshold
-    # Open the video
-    cap = cv2.VideoCapture(video_path)
-    if not cap.isOpened():
-        return None
-    # Get video properties
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    # Create output video path
-    output_path = f"output_{os.path.basename(video_path)}"
-    # Initialize video writer
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-    # Process each frame
-    frame_count = 0
-    while cap.isOpened():
-        ret, frame = cap.read()
-        if not ret:
-            break
-        # Run detection
-        detections, inference_time = detector.detect(frame)
-        # Draw detections
-        annotated_frame = draw_detections(frame, detections)
-        # Add performance stats
-        annotated_frame = add_performance_stats(
-            annotated_frame,
-            inference_time,
-            len(detections)
         )
-        # Write frame to output video
-        out.write(annotated_frame)
-        # Update progress
-        frame_count += 1
-        if frame_count % 10 == 0:
-            print(f"Processed {frame_count}/{total_frames} frames")
-    # Release resources
-    cap.release()
-    out.release()
-    return output_path
-def webcam_detection(image, threshold):
-    """
-    Process webcam frames with people detection.
-    Args:
-        image: Input image from webcam
-        threshold: Detection confidence threshold
-    Returns:
-        Annotated image with detections
-    """
-    return process_image(image, threshold)
-# Create Gradio interface
-with gr.Blocks(title="Real-time People Detection") as app:
-    gr.Markdown("# Real-time People Detection")
-    gr.Markdown("This application detects people in images and videos using YOLOv8.")
-    with gr.Tab("Image Detection"):
-        with gr.Row():
-            with gr.Column():
-                image_input = gr.Image(label="Input Image", type="pil")
-                image_threshold = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=DEFAULT_THRESHOLD,
-                    step=0.05,
-                    label="Detection Threshold"
-                )
-                image_button = gr.Button("Detect People")
-            with gr.Column():
-                image_output = gr.Image(label="Detection Result")
-        image_button.click(
-            process_image,
-            inputs=[image_input, image_threshold],
-            outputs=image_output
-        )
-    with gr.Tab("Video Detection"):
-        with gr.Row():
-            with gr.Column():
-                video_input = gr.Video(label="Input Video")
-                video_threshold = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=DEFAULT_THRESHOLD,
-                    step=0.05,
-                    label="Detection Threshold"
-                )
-                video_button = gr.Button("Process Video")
-            with gr.Column():
-                video_output = gr.Video(label="Detection Result")
-        video_button.click(
-            process_video,
-            inputs=[video_input, video_threshold],
-            outputs=video_output
         )
-    with gr.Tab("Webcam Detection"):
-        with gr.Row():
-            with gr.Column():
-                webcam_threshold = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=DEFAULT_THRESHOLD,
-                    step=0.05,
-                    label="Detection Threshold"
-                )
-        webcam = gr.Webcam(label="Webcam")
-        webcam_output = gr.Image(label="Detection Result")
-        webcam.change(
-            webcam_detection,
-            inputs=[webcam, webcam_threshold],
-            outputs=webcam_output
         )
-# Launch the app
 if __name__ == "__main__":
-    app.launch()

 """
+Real-time People Detection Streamlit application.
+This is the main entry point for the Hugging Face Space application.
 """
 import os
 import time
+from pathlib import Path
+from typing import Tuple, Dict, Any, Optional, List
 import cv2
 import numpy as np
+import streamlit as st
 from PIL import Image
+import torch
 from ultralytics import YOLO
 # Constants
+ASSETS_DIR = Path(__file__).parent / "assets"  # "assets" folder located next to this file
+DEMO_VIDEOS = {  # label offered in the "Select demo video" selectbox -> clip path under ASSETS_DIR
+    "One Person": ASSETS_DIR / "one-by-one-person-detection.mp4",
+    "Store Aisle": ASSETS_DIR / "store-aisle-detection.mp4",
+    "People Detection": ASSETS_DIR / "people-detection.mp4"
+}
 FRAME_WIDTH = 640  # width handed to VideoSource; it is only applied when the source is a camera index
 FRAME_HEIGHT = 480  # height handed to VideoSource; it is only applied when the source is a camera index
 class PeopleDetector:
     """
+    A class for detecting people in images using a pre-trained YOLOv8n model.
+    Attributes:
+        model_name: Name or path of the YOLOv8 model to use
+        threshold: Confidence threshold for detection
+        device: Device to run inference on (cuda/cpu)
+        model: The detection model
     """
     def __init__(
         self,
+        model_name: str = "yolov8n.pt",
+        threshold: float = 0.5,
         device: Optional[str] = None,
     ):
         """
         Initialize the people detector with a pre-trained model.
         Args:
+            model_name: YOLOv8 model name to use ('yolov8n.pt' is the smallest one)
             threshold: Confidence threshold for detection (0.0 to 1.0)
             device: Device to run inference on (cuda/cpu). If None, will use cuda if available.
         """
         Detect people in an image.
         Args:
+            image: Input image as numpy array (BGR format from OpenCV)
         Returns:
             Tuple containing:
         inference_time = time.time() - start_time
         return detections, inference_time
+    def update_threshold(self, threshold: float) -> None:
+        """
+        Update the detection confidence threshold.
+        Args:
+            threshold: New threshold value (0.0 to 1.0)
+        """
+        self.threshold = threshold  # stored as given; no range check or clamping happens here
+class VideoSource:
+    """
+    A class for handling video input from different sources (webcam or file).
+    Attributes:
+        source: Camera index (int) or video file path (str)
+        width: Frame width to set (if possible)
+        height: Frame height to set (if possible)
+        fps_buffer_size: Number of frames to average for FPS calculation
+    """
+    def __init__(
+        self,
+        source: Any = 0,
+        width: int = 640,
+        height: int = 480,
+        fps_buffer_size: int = 30,
+    ):
+        """
+        Initialize the video source.
+        Args:
+            source: Camera index (int) or video file path (str)
+            width: Width to set for the captured frames
+            height: Height to set for the captured frames
+            fps_buffer_size: Number of frames to use for FPS averaging
+        """
+        self.source = source
+        self.width = width
+        self.height = height
+        self.fps_buffer_size = fps_buffer_size
+        self.cap = None
+        self.frame_times = []
+        self.is_running = False
+    def start(self) -> bool:
+        """
+        Start the video capture.
+        Returns:
+            bool: True if capture was started successfully, False otherwise
+        """
+        if self.is_running:
+            return True
+        self.cap = cv2.VideoCapture(self.source)
+        if not self.cap.isOpened():
+            return False
+        # Try to set properties if it's a webcam
+        if isinstance(self.source, int):
+            self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, self.width)
+            self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self.height)
+        self.is_running = True
+        self.frame_times = []
+        return True
+    def stop(self) -> None:
+        """Stop the video capture and release resources."""
+        if self.is_running and self.cap is not None:
+            self.cap.release()
+            self.is_running = False
+    def read_frame(self) -> Tuple[bool, Optional[np.ndarray]]:
+        """
+        Read a single frame from the video source.
+        Returns:
+            Tuple containing:
+                - Boolean indicating if frame was successfully read
+                - Image as numpy array (or None if no frame was read)
+        """
+        if not self.is_running or self.cap is None:
+            return False, None
+        # Record time for FPS calculation
+        current_time = time.time()
+        # Read frame
+        ret, frame = self.cap.read()
+        if ret:
+            # Update FPS buffer
+            self.frame_times.append(current_time)
+            if len(self.frame_times) > self.fps_buffer_size:
+                self.frame_times.pop(0)
+        return ret, frame
+    def get_fps(self) -> float:
+        """
+        Calculate the current FPS based on actual frame timings.
+        Returns:
+            float: Current frames per second
+        """
+        if len(self.frame_times) < 2:
+            return 0.0
+        # Calculate FPS from time differences
+        time_diff = self.frame_times[-1] - self.frame_times[0]
+        if time_diff > 0:
+            return (len(self.frame_times) - 1) / time_diff
+        return 0.0
 def draw_detections(
     image: np.ndarray,
     return annotated_image
 def add_performance_stats(
     image: np.ndarray,
+    fps: float,
     inference_time: float,
     people_count: int,
+    inference_fps: float = 0.0,
     bg_color: Tuple[int, int, int] = (0, 0, 0),
     text_color: Tuple[int, int, int] = (255, 255, 255),
     font_scale: float = 0.5,
     Args:
         image: Input image to add stats to
+        fps: Current FPS value
         inference_time: Model inference time in seconds
         people_count: Number of people detected
+        inference_fps: Inference FPS (model predictions per second)
         bg_color: Background color for stats box
         text_color: Text color for stats
         font_scale: Font scale for text
     stats_image = image.copy()
     # Create stats text
+    fps_text = f"FPS: {fps:.1f}"
     inference_text = f"Inference: {inference_time*1000:.1f}ms"
+    count_text = f"People: {people_count}"
+    inf_fps_text = f"Inference FPS: {inference_fps:.1f}"
     # Get text sizes
+    (fps_width, fps_height), _ = cv2.getTextSize(
+        fps_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
     )
     (inf_width, inf_height), _ = cv2.getTextSize(
         inference_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
     )
+    (count_width, count_height), _ = cv2.getTextSize(
+        count_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+    )
+    (inf_fps_width, inf_fps_height), _ = cv2.getTextSize(
+        inf_fps_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+    )
     # Calculate background box dimensions
+    box_width = max(fps_width, inf_width, count_width, inf_fps_width) + 20
+    box_height = fps_height + inf_height + count_height + inf_fps_height + 30
     # Draw background box
     cv2.rectangle(
     )
     # Draw text
+    y_offset = 10 + fps_height + 5
     cv2.putText(
         stats_image,
+        fps_text,
+        (20, y_offset),
         cv2.FONT_HERSHEY_SIMPLEX,
         font_scale,
         text_color,
         thickness
     )
+    y_offset += inf_height + 5
     cv2.putText(
         stats_image,
         inference_text,
+        (20, y_offset),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_scale,
+        text_color,
+        thickness
+    )
+    y_offset += count_height + 5
+    cv2.putText(
+        stats_image,
+        count_text,
+        (20, y_offset),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_scale,
+        text_color,
+        thickness
+    )
+    y_offset += inf_fps_height + 5
+    cv2.putText(
+        stats_image,
+        inf_fps_text,
+        (20, y_offset),
         cv2.FONT_HERSHEY_SIMPLEX,
         font_scale,
         text_color,
     return stats_image
+class PeopleDetectionApp:
     """
+    Streamlit application for real-time people detection.
+    This class handles the Streamlit UI components and orchestrates
+    the video capture and detection processes.
     """
+    def __init__(self):
+        """Initialize the Streamlit application components."""
+        # Set page config
+        st.set_page_config(
+            page_title="Real-time People Detection",
+            page_icon="👁️",
+            layout="wide",
+        )
+        # Initialize session state
+        if "video_source" not in st.session_state:
+            st.session_state.video_source = None
+        if "detector" not in st.session_state:
+            st.session_state.detector = None
+        if "is_running" not in st.session_state:
+            st.session_state.is_running = False
+        if "frame_placeholder" not in st.session_state:
+            st.session_state.frame_placeholder = None
+        if "last_inference_time" not in st.session_state:
+            st.session_state.last_inference_time = 0.0
+        if "last_inference_timestamp" not in st.session_state:
+            st.session_state.last_inference_timestamp = 0.0
+        if "frame_count" not in st.session_state:
+            st.session_state.frame_count = 0
+        if "last_frame" not in st.session_state:
+            st.session_state.last_frame = None
+        if "last_detections" not in st.session_state:
+            st.session_state.last_detections = []
+    def create_ui(self):
+        """Create the Streamlit UI components."""
+        # Page header
+        st.title("Real-time People Detection")
+        st.markdown(
+            "This application detects people in video streams using YOLOv8."
+        )
+        # Sidebar for controls
+        with st.sidebar:
+            st.header("Settings")
+            # Model selection
+            model_name = st.selectbox(
+                "Select detection model",
+                options=[
+                    "yolov8n.pt",  # Nano model (smallest)
+                ],
+                index=0,
+            )
+            # Detection threshold
+            detection_threshold = st.slider(
+                "Detection threshold",
+                min_value=0.1,
+                max_value=1.0,
+                value=0.5,
+                step=0.05,
+            )
+            # Target inference FPS
+            target_fps = st.slider(
+                "Target inference FPS",
+                min_value=1,
+                max_value=30,
+                value=10,
+                step=1,
+                help="Control how many frames per second are sent to the model for inference. Lower values use less resources but may appear less smooth."
+            )
+            # For Hugging Face Space, we only provide demo videos (no webcam)
+            source_type = "Demo Video"
+            # Let user select which demo video to use
+            demo_selection = st.selectbox(
+                "Select demo video",
+                options=list(DEMO_VIDEOS.keys()),
+                index=0,
+            )
+            video_path = str(DEMO_VIDEOS[demo_selection])
+            source = video_path
+            # Control buttons
+            col1, col2 = st.columns(2)
+            with col1:
+                start_button = st.button(
+                    "Start" if not st.session_state.is_running else "Restart",
+                    use_container_width=True,
+                )
+            with col2:
+                stop_button = st.button(
+                    "Stop",
+                    use_container_width=True,
+                    disabled=not st.session_state.is_running,
+                )
+        # Main area for video display
+        video_column, stats_column = st.columns([3, 1])
+        with video_column:
+            st.subheader("Detection Feed")
+            # Create a placeholder for the video frame
+            frame_placeholder = st.empty()
+            st.session_state.frame_placeholder = frame_placeholder
+        with stats_column:
+            st.subheader("Performance Stats")
+            # Create placeholders for stats
+            fps_text = st.empty()
+            inference_text = st.empty()
+            people_count = st.empty()
+            inference_fps_text = st.empty()
+        # Handle button actions
+        if start_button:
+            self.start_detection(source, model_name, detection_threshold, target_fps)
+        if stop_button:
+            self.stop_detection()
+        # Return stats placeholders for updating
+        return fps_text, inference_text, people_count, inference_fps_text
+    def start_detection(self, source, model_name, threshold, target_fps):
+        """
+        Start the detection process.
+        Args:
+            source: Video source (camera ID or file path)
+            model_name: YOLOv8 model to use
+            threshold: Detection confidence threshold
+            target_fps: Target frames per second for inference
+        """
+        # Stop existing detection if running
+        self.stop_detection()
+        # Initialize video source
+        video_source = VideoSource(
+            source=source,
+            width=FRAME_WIDTH,
+            height=FRAME_HEIGHT,
         )
+        # Initialize detector
+        detector = PeopleDetector(
+            model_name=model_name,
+            threshold=threshold,
+        )
+        # Start video capture
+        if not video_source.start():
+            st.error(f"Failed to open video source: {source}")
+            return
+        # Store objects in session state
+        st.session_state.video_source = video_source
+        st.session_state.detector = detector
+        st.session_state.is_running = True
+        st.session_state.target_fps = target_fps
+        st.session_state.last_inference_timestamp = time.time()
+        st.session_state.frame_count = 0
+        st.session_state.last_frame = None
+        st.session_state.last_detections = []
+    def stop_detection(self):
+        """Stop the detection process and release resources."""
+        if st.session_state.video_source is not None:
+            st.session_state.video_source.stop()
+            st.session_state.video_source = None
+        st.session_state.detector = None
+        st.session_state.is_running = False
+        st.session_state.last_frame = None
+        st.session_state.last_detections = []
+    def update_frame(self, fps_text, inference_text, people_count, inference_fps_text):
+        """
+        Update the video frame and stats.
+        Args:
+            fps_text: Streamlit element for FPS display
+            inference_text: Streamlit element for inference time display
+            people_count: Streamlit element for people count display
+            inference_fps_text: Streamlit element for inference FPS display
+        """
+        if not st.session_state.is_running:
+            return
+        video_source = st.session_state.video_source
+        detector = st.session_state.detector
+        target_fps = st.session_state.target_fps
+        if video_source is None or detector is None:
+            return
+        # Read a new frame
+        ret, frame = video_source.read_frame()
+        if not ret:
+            # If we've reached the end of a video file, restart it
+            if not isinstance(video_source.source, int):
+                # Restart video
+                video_source.stop()
+                if video_source.start():
+                    ret, frame = video_source.read_frame()
+                    if not ret:
+                        st.error("Failed to restart video")
+                        self.stop_detection()
+                        return
+                else:
+                    st.error("Failed to restart video source")
+                    self.stop_detection()
+                    return
+            else:
+                st.error("Failed to read frame from camera")
+                self.stop_detection()
+                return
+        # Calculate current FPS
+        fps = video_source.get_fps()
+        # Determine if we should run inference on this frame
+        current_time = time.time()
+        time_since_last_inference = current_time - st.session_state.last_inference_timestamp
+        inference_interval = 1.0 / target_fps
+        # Use cached detections or run new detection
+        detections = []
+        inference_time = 0
+        # Run a new detection if enough time has passed
+        if time_since_last_inference >= inference_interval:
+            detections, inference_time = detector.detect(frame)
+            # Update cache
+            st.session_state.last_frame = frame.copy()
+            st.session_state.last_detections = detections
+            st.session_state.last_inference_time = inference_time
+            st.session_state.last_inference_timestamp = current_time
+        else:
+            # Use cached detections
+            detections = st.session_state.last_detections
+            inference_time = st.session_state.last_inference_time
+        # Draw detections on the frame
+        frame_with_detections = draw_detections(frame, detections)
+        # Calculate inference FPS
+        if time_since_last_inference > 0:
+            inference_fps = 1.0 / time_since_last_inference
+        else:
+            inference_fps = 0.0
+        # Add performance stats to the frame
+        frame_with_stats = add_performance_stats(
+            frame_with_detections,
+            fps,
+            inference_time,
+            len(detections),
+            inference_fps
         )
+        # Display the frame
+        st.session_state.frame_placeholder.image(
+            frame_with_stats,
+            channels="BGR",
+            use_column_width=True
         )
+        # Update stats
+        fps_text.metric("FPS", f"{fps:.1f}")
+        inference_text.metric("Inference Time", f"{inference_time*1000:.1f} ms")
+        people_count.metric("People Detected", len(detections))
+        inference_fps_text.metric("Inference FPS", f"{inference_fps:.1f}")
+        # Increment frame counter
+        st.session_state.frame_count += 1
+def main():
+    """Main entry point for the application."""
+    app = PeopleDetectionApp()
+    fps_text, inference_text, people_count, inference_fps_text = app.create_ui()
+    # Infinite loop for updating the video frame
+    while st.session_state.is_running:
+        app.update_frame(fps_text, inference_text, people_count, inference_fps_text)
+        time.sleep(0.01)  # Small delay to prevent overloading the CPU
 if __name__ == "__main__":
+    main()