ekabaruh committed
Commit 3643479 · verified · 1 Parent(s): 1d5e702

Update app.py

Files changed (1)
  1. app.py +468 -203
app.py CHANGED
@@ -1,44 +1,55 @@
  """
- Real-time People Detection App for Hugging Face Space

- This application detects people in images and videos using YOLOv8 from Ultralytics.
- The app provides an interface for uploading images or using webcam for real-time detection.
  """

  import os
  import time
  import cv2
  import numpy as np
- import gradio as gr
- import torch
  from PIL import Image
- from pathlib import Path
- from typing import Dict, List, Tuple, Any, Optional, Union
  from ultralytics import YOLO

  # Constants
- MODEL_PATH = "yolov8n.pt"
- DEMO_VIDEOS_DIR = "demo_videos"
  FRAME_WIDTH = 640
  FRAME_HEIGHT = 480
- DEFAULT_THRESHOLD = 0.5

  class PeopleDetector:
      """
-     A class for detecting people in images using a pre-trained YOLOv8 model.
      """

      def __init__(
          self,
-         model_name: str = MODEL_PATH,
-         threshold: float = DEFAULT_THRESHOLD,
          device: Optional[str] = None,
      ):
          """
          Initialize the people detector with a pre-trained model.

          Args:
-             model_name: YOLOv8 model name to use
              threshold: Confidence threshold for detection (0.0 to 1.0)
              device: Device to run inference on (cuda/cpu). If None, will use cuda if available.
          """
@@ -62,7 +73,7 @@ class PeopleDetector:
          Detect people in an image.

          Args:
-             image: Input image as numpy array

          Returns:
              Tuple containing:
@@ -102,6 +113,125 @@ class PeopleDetector:
          inference_time = time.time() - start_time

          return detections, inference_time

  def draw_detections(
      image: np.ndarray,
@@ -168,10 +298,13 @@ def draw_detections(

      return annotated_image

  def add_performance_stats(
      image: np.ndarray,
      inference_time: float,
      people_count: int,
      bg_color: Tuple[int, int, int] = (0, 0, 0),
      text_color: Tuple[int, int, int] = (255, 255, 255),
      font_scale: float = 0.5,
@@ -182,8 +315,10 @@ def add_performance_stats(

      Args:
          image: Input image to add stats to
          inference_time: Model inference time in seconds
          people_count: Number of people detected
          bg_color: Background color for stats box
          text_color: Text color for stats
          font_scale: Font scale for text
@@ -195,20 +330,28 @@ def add_performance_stats(
      stats_image = image.copy()

      # Create stats text
-     people_text = f"People: {people_count}"
      inference_text = f"Inference: {inference_time*1000:.1f}ms"

      # Get text sizes
-     (people_width, people_height), _ = cv2.getTextSize(
-         people_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
      )
      (inf_width, inf_height), _ = cv2.getTextSize(
          inference_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
      )

      # Calculate background box dimensions
-     box_width = max(people_width, inf_width) + 20
-     box_height = people_height + inf_height + 20

      # Draw background box
      cv2.rectangle(
@@ -220,20 +363,44 @@ def add_performance_stats(
      )

      # Draw text
      cv2.putText(
          stats_image,
-         people_text,
-         (20, 10 + people_height + 5),
          cv2.FONT_HERSHEY_SIMPLEX,
          font_scale,
          text_color,
          thickness
      )

      cv2.putText(
          stats_image,
          inference_text,
-         (20, 10 + people_height + inf_height + 10),
          cv2.FONT_HERSHEY_SIMPLEX,
          font_scale,
          text_color,
@@ -242,207 +409,305 @@ def add_performance_stats(

      return stats_image

- # Initialize the detector
- detector = PeopleDetector(model_name=MODEL_PATH, threshold=DEFAULT_THRESHOLD)

- def process_image(image, threshold):
      """
-     Process an image with people detection.

-     Args:
-         image: Input image
-         threshold: Detection confidence threshold
-
-     Returns:
-         Annotated image with detections
      """
-     if image is None:
-         return None
-
-     # Update threshold if needed
-     if detector.threshold != threshold:
-         detector.threshold = threshold
-
-     # Convert to numpy array if needed
-     if isinstance(image, Image.Image):
-         image_array = np.array(image)
-         # Convert RGB to BGR (OpenCV format)
-         if len(image_array.shape) == 3 and image_array.shape[2] == 3:
-             image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
-     else:
-         image_array = image
-
-     # Run detection
-     detections, inference_time = detector.detect(image_array)

-     # Draw detections
-     annotated_image = draw_detections(image_array, detections)
-
-     # Add performance stats
-     annotated_image = add_performance_stats(
-         annotated_image,
-         inference_time,
-         len(detections)
-     )
-
-     # Convert back to RGB for display
-     if len(annotated_image.shape) == 3 and annotated_image.shape[2] == 3:
-         annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
-
-     return annotated_image

- def process_video(video_path, threshold):
-     """
-     Process a video with people detection.
-
-     Args:
-         video_path: Path to input video
-         threshold: Detection confidence threshold

-     Returns:
-         Path to output video with detections
-     """
-     if video_path is None:
-         return None
-
-     # Update threshold if needed
-     if detector.threshold != threshold:
-         detector.threshold = threshold
-
-     # Open the video
-     cap = cv2.VideoCapture(video_path)
-     if not cap.isOpened():
-         return None
-
-     # Get video properties
-     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-     fps = cap.get(cv2.CAP_PROP_FPS)
-     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-     # Create output video path
-     output_path = f"output_{os.path.basename(video_path)}"
-
-     # Initialize video writer
-     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-     out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-
-     # Process each frame
-     frame_count = 0
-     while cap.isOpened():
-         ret, frame = cap.read()
-         if not ret:
-             break

-         # Run detection
-         detections, inference_time = detector.detect(frame)

-         # Draw detections
-         annotated_frame = draw_detections(frame, detections)

-         # Add performance stats
-         annotated_frame = add_performance_stats(
-             annotated_frame,
-             inference_time,
-             len(detections)
          )

-         # Write frame to output video
-         out.write(annotated_frame)

-         # Update progress
-         frame_count += 1
-         if frame_count % 10 == 0:
-             print(f"Processed {frame_count}/{total_frames} frames")
-
-     # Release resources
-     cap.release()
-     out.release()
-
-     return output_path
-
- def webcam_detection(image, threshold):
-     """
-     Process webcam frames with people detection.
-
-     Args:
-         image: Input image from webcam
-         threshold: Detection confidence threshold

-     Returns:
-         Annotated image with detections
-     """
-     return process_image(image, threshold)
-
- # Create Gradio interface
- with gr.Blocks(title="Real-time People Detection") as app:
-     gr.Markdown("# Real-time People Detection")
-     gr.Markdown("This application detects people in images and videos using YOLOv8.")

-     with gr.Tab("Image Detection"):
-         with gr.Row():
-             with gr.Column():
-                 image_input = gr.Image(label="Input Image", type="pil")
-                 image_threshold = gr.Slider(
-                     minimum=0.1,
-                     maximum=1.0,
-                     value=DEFAULT_THRESHOLD,
-                     step=0.05,
-                     label="Detection Threshold"
-                 )
-                 image_button = gr.Button("Detect People")

-             with gr.Column():
-                 image_output = gr.Image(label="Detection Result")

-         image_button.click(
-             process_image,
-             inputs=[image_input, image_threshold],
-             outputs=image_output
-         )
-
-     with gr.Tab("Video Detection"):
-         with gr.Row():
-             with gr.Column():
-                 video_input = gr.Video(label="Input Video")
-                 video_threshold = gr.Slider(
-                     minimum=0.1,
-                     maximum=1.0,
-                     value=DEFAULT_THRESHOLD,
-                     step=0.05,
-                     label="Detection Threshold"
-                 )
-                 video_button = gr.Button("Process Video")

-             with gr.Column():
-                 video_output = gr.Video(label="Detection Result")

-         video_button.click(
-             process_video,
-             inputs=[video_input, video_threshold],
-             outputs=video_output
          )
-
-     with gr.Tab("Webcam Detection"):
-         with gr.Row():
-             with gr.Column():
-                 webcam_threshold = gr.Slider(
-                     minimum=0.1,
-                     maximum=1.0,
-                     value=DEFAULT_THRESHOLD,
-                     step=0.05,
-                     label="Detection Threshold"
-                 )
-
-                 webcam = gr.Webcam(label="Webcam")
-                 webcam_output = gr.Image(label="Detection Result")

-         webcam.change(
-             webcam_detection,
-             inputs=[webcam, webcam_threshold],
-             outputs=webcam_output
          )

- # Launch the app
  if __name__ == "__main__":
-     app.launch()

  """
+ Real-time People Detection Streamlit application.

+ This is the main entry point for the Hugging Face Space application.
  """

  import os
  import time
+ from pathlib import Path
+ from typing import Tuple, Dict, Any, Optional, List
+
  import cv2
  import numpy as np
+ import streamlit as st
  from PIL import Image
+ import torch
  from ultralytics import YOLO

+
  # Constants
+ ASSETS_DIR = Path(__file__).parent / "assets"
+ DEMO_VIDEOS = {
+     "One Person": ASSETS_DIR / "one-by-one-person-detection.mp4",
+     "Store Aisle": ASSETS_DIR / "store-aisle-detection.mp4",
+     "People Detection": ASSETS_DIR / "people-detection.mp4"
+ }
  FRAME_WIDTH = 640
  FRAME_HEIGHT = 480
+

  class PeopleDetector:
      """
+     A class for detecting people in images using a pre-trained YOLOv8n model.
+
+     Attributes:
+         model_name: Name or path of the YOLOv8 model to use
+         threshold: Confidence threshold for detection
+         device: Device to run inference on (cuda/cpu)
+         model: The detection model
      """

      def __init__(
          self,
+         model_name: str = "yolov8n.pt",
+         threshold: float = 0.5,
          device: Optional[str] = None,
      ):
          """
          Initialize the people detector with a pre-trained model.

          Args:
+             model_name: YOLOv8 model name to use ('yolov8n.pt' is the smallest one)
              threshold: Confidence threshold for detection (0.0 to 1.0)
              device: Device to run inference on (cuda/cpu). If None, will use cuda if available.
          """
          Detect people in an image.

          Args:
+             image: Input image as numpy array (BGR format from OpenCV)

          Returns:
              Tuple containing:

          inference_time = time.time() - start_time

          return detections, inference_time
+
+     def update_threshold(self, threshold: float) -> None:
+         """
+         Update the detection confidence threshold.
+
+         Args:
+             threshold: New threshold value (0.0 to 1.0)
+         """
+         self.threshold = threshold
+
+
+ class VideoSource:
+     """
+     A class for handling video input from different sources (webcam or file).
+
+     Attributes:
+         source: Camera index (int) or video file path (str)
+         width: Frame width to set (if possible)
+         height: Frame height to set (if possible)
+         fps_buffer_size: Number of frames to average for FPS calculation
+     """
+
+     def __init__(
+         self,
+         source: Any = 0,
+         width: int = 640,
+         height: int = 480,
+         fps_buffer_size: int = 30,
+     ):
+         """
+         Initialize the video source.
+
+         Args:
+             source: Camera index (int) or video file path (str)
+             width: Width to set for the captured frames
+             height: Height to set for the captured frames
+             fps_buffer_size: Number of frames to use for FPS averaging
+         """
+         self.source = source
+         self.width = width
+         self.height = height
+         self.fps_buffer_size = fps_buffer_size
+
+         self.cap = None
+         self.frame_times = []
+         self.is_running = False
+
+     def start(self) -> bool:
+         """
+         Start the video capture.
+
+         Returns:
+             bool: True if capture was started successfully, False otherwise
+         """
+         if self.is_running:
+             return True
+
+         self.cap = cv2.VideoCapture(self.source)
+
+         if not self.cap.isOpened():
+             return False
+
+         # Try to set properties if it's a webcam
+         if isinstance(self.source, int):
+             self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, self.width)
+             self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self.height)
+
+         self.is_running = True
+         self.frame_times = []
+         return True
+
+     def stop(self) -> None:
+         """Stop the video capture and release resources."""
+         if self.is_running and self.cap is not None:
+             self.cap.release()
+             self.is_running = False
+
+     def read_frame(self) -> Tuple[bool, Optional[np.ndarray]]:
+         """
+         Read a single frame from the video source.
+
+         Returns:
+             Tuple containing:
+             - Boolean indicating if frame was successfully read
+             - Image as numpy array (or None if no frame was read)
+         """
+         if not self.is_running or self.cap is None:
+             return False, None
+
+         # Record time for FPS calculation
+         current_time = time.time()
+
+         # Read frame
+         ret, frame = self.cap.read()
+
+         if ret:
+             # Update FPS buffer
+             self.frame_times.append(current_time)
+             if len(self.frame_times) > self.fps_buffer_size:
+                 self.frame_times.pop(0)
+
+         return ret, frame
+
+     def get_fps(self) -> float:
+         """
+         Calculate the current FPS based on actual frame timings.
+
+         Returns:
+             float: Current frames per second
+         """
+         if len(self.frame_times) < 2:
+             return 0.0
+
+         # Calculate FPS from time differences
+         time_diff = self.frame_times[-1] - self.frame_times[0]
+         if time_diff > 0:
+             return (len(self.frame_times) - 1) / time_diff
+         return 0.0
+

  def draw_detections(
      image: np.ndarray,

      return annotated_image

+
  def add_performance_stats(
      image: np.ndarray,
+     fps: float,
      inference_time: float,
      people_count: int,
+     inference_fps: float = 0.0,
      bg_color: Tuple[int, int, int] = (0, 0, 0),
      text_color: Tuple[int, int, int] = (255, 255, 255),
      font_scale: float = 0.5,

      Args:
          image: Input image to add stats to
+         fps: Current FPS value
          inference_time: Model inference time in seconds
          people_count: Number of people detected
+         inference_fps: Inference FPS (model predictions per second)
          bg_color: Background color for stats box
          text_color: Text color for stats
          font_scale: Font scale for text

      stats_image = image.copy()

      # Create stats text
+     fps_text = f"FPS: {fps:.1f}"
      inference_text = f"Inference: {inference_time*1000:.1f}ms"
+     count_text = f"People: {people_count}"
+     inf_fps_text = f"Inference FPS: {inference_fps:.1f}"

      # Get text sizes
+     (fps_width, fps_height), _ = cv2.getTextSize(
+         fps_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
      )
      (inf_width, inf_height), _ = cv2.getTextSize(
          inference_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
      )
+     (count_width, count_height), _ = cv2.getTextSize(
+         count_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+     )
+     (inf_fps_width, inf_fps_height), _ = cv2.getTextSize(
+         inf_fps_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness
+     )

      # Calculate background box dimensions
+     box_width = max(fps_width, inf_width, count_width, inf_fps_width) + 20
+     box_height = fps_height + inf_height + count_height + inf_fps_height + 30

      # Draw background box
      cv2.rectangle(
      )

      # Draw text
+     y_offset = 10 + fps_height + 5
      cv2.putText(
          stats_image,
+         fps_text,
+         (20, y_offset),
          cv2.FONT_HERSHEY_SIMPLEX,
          font_scale,
          text_color,
          thickness
      )

+     y_offset += inf_height + 5
      cv2.putText(
          stats_image,
          inference_text,
+         (20, y_offset),
+         cv2.FONT_HERSHEY_SIMPLEX,
+         font_scale,
+         text_color,
+         thickness
+     )
+
+     y_offset += count_height + 5
+     cv2.putText(
+         stats_image,
+         count_text,
+         (20, y_offset),
+         cv2.FONT_HERSHEY_SIMPLEX,
+         font_scale,
+         text_color,
+         thickness
+     )
+
+     y_offset += inf_fps_height + 5
+     cv2.putText(
+         stats_image,
+         inf_fps_text,
+         (20, y_offset),
          cv2.FONT_HERSHEY_SIMPLEX,
          font_scale,
          text_color,

      return stats_image


+ class PeopleDetectionApp:
      """
+     Streamlit application for real-time people detection.

+     This class handles the Streamlit UI components and orchestrates
+     the video capture and detection processes.
      """

+     def __init__(self):
+         """Initialize the Streamlit application components."""
+         # Set page config
+         st.set_page_config(
+             page_title="Real-time People Detection",
+             page_icon="👁️",
+             layout="wide",
+         )
+
+         # Initialize session state
+         if "video_source" not in st.session_state:
+             st.session_state.video_source = None
+         if "detector" not in st.session_state:
+             st.session_state.detector = None
+         if "is_running" not in st.session_state:
+             st.session_state.is_running = False
+         if "frame_placeholder" not in st.session_state:
+             st.session_state.frame_placeholder = None
+         if "last_inference_time" not in st.session_state:
+             st.session_state.last_inference_time = 0.0
+         if "last_inference_timestamp" not in st.session_state:
+             st.session_state.last_inference_timestamp = 0.0
+         if "frame_count" not in st.session_state:
+             st.session_state.frame_count = 0
+         if "last_frame" not in st.session_state:
+             st.session_state.last_frame = None
+         if "last_detections" not in st.session_state:
+             st.session_state.last_detections = []
+
+     def create_ui(self):
+         """Create the Streamlit UI components."""
+         # Page header
+         st.title("Real-time People Detection")
+         st.markdown(
+             "This application detects people in video streams using YOLOv8."
+         )
+
+         # Sidebar for controls
+         with st.sidebar:
+             st.header("Settings")
+
+             # Model selection
+             model_name = st.selectbox(
+                 "Select detection model",
+                 options=[
+                     "yolov8n.pt",  # Nano model (smallest)
+                 ],
+                 index=0,
+             )
+
+             # Detection threshold
+             detection_threshold = st.slider(
+                 "Detection threshold",
+                 min_value=0.1,
+                 max_value=1.0,
+                 value=0.5,
+                 step=0.05,
+             )

+             # Target inference FPS
+             target_fps = st.slider(
+                 "Target inference FPS",
+                 min_value=1,
+                 max_value=30,
+                 value=10,
+                 step=1,
+                 help="Control how many frames per second are sent to the model for inference. Lower values use less resources but may appear less smooth."
+             )
+
+             # For Hugging Face Space, we only provide demo videos (no webcam)
+             source_type = "Demo Video"
+
+             # Let user select which demo video to use
+             demo_selection = st.selectbox(
+                 "Select demo video",
+                 options=list(DEMO_VIDEOS.keys()),
+                 index=0,
+             )
+             video_path = str(DEMO_VIDEOS[demo_selection])
+             source = video_path
+
+             # Control buttons
+             col1, col2 = st.columns(2)
+
+             with col1:
+                 start_button = st.button(
+                     "Start" if not st.session_state.is_running else "Restart",
+                     use_container_width=True,
+                 )
+
+             with col2:
+                 stop_button = st.button(
+                     "Stop",
+                     use_container_width=True,
+                     disabled=not st.session_state.is_running,
+                 )

+         # Main area for video display
+         video_column, stats_column = st.columns([3, 1])
+
+         with video_column:
+             st.subheader("Detection Feed")
+             # Create a placeholder for the video frame
+             frame_placeholder = st.empty()
+             st.session_state.frame_placeholder = frame_placeholder
+
+         with stats_column:
+             st.subheader("Performance Stats")
+             # Create placeholders for stats
+             fps_text = st.empty()
+             inference_text = st.empty()
+             people_count = st.empty()
+             inference_fps_text = st.empty()
+
+         # Handle button actions
+         if start_button:
+             self.start_detection(source, model_name, detection_threshold, target_fps)
+
+         if stop_button:
+             self.stop_detection()

+         # Return stats placeholders for updating
+         return fps_text, inference_text, people_count, inference_fps_text
+
+     def start_detection(self, source, model_name, threshold, target_fps):
+         """
+         Start the detection process.

+         Args:
+             source: Video source (camera ID or file path)
+             model_name: YOLOv8 model to use
+             threshold: Detection confidence threshold
+             target_fps: Target frames per second for inference
+         """
+         # Stop existing detection if running
+         self.stop_detection()

+         # Initialize video source
+         video_source = VideoSource(
+             source=source,
+             width=FRAME_WIDTH,
+             height=FRAME_HEIGHT,
          )

+         # Initialize detector
+         detector = PeopleDetector(
+             model_name=model_name,
+             threshold=threshold,
+         )

+         # Start video capture
+         if not video_source.start():
+             st.error(f"Failed to open video source: {source}")
+             return

+         # Store objects in session state
+         st.session_state.video_source = video_source
+         st.session_state.detector = detector
+         st.session_state.is_running = True
+         st.session_state.target_fps = target_fps
+         st.session_state.last_inference_timestamp = time.time()
+         st.session_state.frame_count = 0
+         st.session_state.last_frame = None
+         st.session_state.last_detections = []
+
+     def stop_detection(self):
+         """Stop the detection process and release resources."""
+         if st.session_state.video_source is not None:
+             st.session_state.video_source.stop()
+             st.session_state.video_source = None
+
+         st.session_state.detector = None
+         st.session_state.is_running = False
+         st.session_state.last_frame = None
+         st.session_state.last_detections = []

+     def update_frame(self, fps_text, inference_text, people_count, inference_fps_text):
+         """
+         Update the video frame and stats.
+
+         Args:
+             fps_text: Streamlit element for FPS display
+             inference_text: Streamlit element for inference time display
+             people_count: Streamlit element for people count display
+             inference_fps_text: Streamlit element for inference FPS display
+         """
+         if not st.session_state.is_running:
+             return
+
+         video_source = st.session_state.video_source
+         detector = st.session_state.detector
+         target_fps = st.session_state.target_fps
+
+         if video_source is None or detector is None:
+             return

+         # Read a new frame
+         ret, frame = video_source.read_frame()

+         if not ret:
+             # If we've reached the end of a video file, restart it
+             if not isinstance(video_source.source, int):
+                 # Restart video
+                 video_source.stop()
+                 if video_source.start():
+                     ret, frame = video_source.read_frame()
+                     if not ret:
+                         st.error("Failed to restart video")
+                         self.stop_detection()
+                         return
+                 else:
+                     st.error("Failed to restart video source")
+                     self.stop_detection()
+                     return
+             else:
+                 st.error("Failed to read frame from camera")
+                 self.stop_detection()
+                 return
+
+         # Calculate current FPS
+         fps = video_source.get_fps()
+
+         # Determine if we should run inference on this frame
+         current_time = time.time()
+         time_since_last_inference = current_time - st.session_state.last_inference_timestamp
+         inference_interval = 1.0 / target_fps
+
+         # Use cached detections or run new detection
+         detections = []
+         inference_time = 0
+
+         # Run a new detection if enough time has passed
+         if time_since_last_inference >= inference_interval:
+             detections, inference_time = detector.detect(frame)

+             # Update cache
+             st.session_state.last_frame = frame.copy()
+             st.session_state.last_detections = detections
+             st.session_state.last_inference_time = inference_time
+             st.session_state.last_inference_timestamp = current_time
+         else:
+             # Use cached detections
+             detections = st.session_state.last_detections
+             inference_time = st.session_state.last_inference_time
+
+         # Draw detections on the frame
+         frame_with_detections = draw_detections(frame, detections)
+
+         # Calculate inference FPS
+         if time_since_last_inference > 0:
+             inference_fps = 1.0 / time_since_last_inference
+         else:
+             inference_fps = 0.0

+         # Add performance stats to the frame
+         frame_with_stats = add_performance_stats(
+             frame_with_detections,
+             fps,
+             inference_time,
+             len(detections),
+             inference_fps
          )

+         # Display the frame
+         st.session_state.frame_placeholder.image(
+             frame_with_stats,
+             channels="BGR",
+             use_column_width=True
          )
+
+         # Update stats
+         fps_text.metric("FPS", f"{fps:.1f}")
+         inference_text.metric("Inference Time", f"{inference_time*1000:.1f} ms")
+         people_count.metric("People Detected", len(detections))
+         inference_fps_text.metric("Inference FPS", f"{inference_fps:.1f}")
+
+         # Increment frame counter
+         st.session_state.frame_count += 1
+
+
+ def main():
+     """Main entry point for the application."""
+     app = PeopleDetectionApp()
+     fps_text, inference_text, people_count, inference_fps_text = app.create_ui()
+
+     # Infinite loop for updating the video frame
+     while st.session_state.is_running:
+         app.update_frame(fps_text, inference_text, people_count, inference_fps_text)
+         time.sleep(0.01)  # Small delay to prevent overloading the CPU
+

  if __name__ == "__main__":
+     main()
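
Note on the inference-throttling pattern this commit introduces in update_frame: a fresh detection only runs when at least 1 / target_fps seconds have elapsed since the previous one; in between, the cached detections are redrawn on each new frame. A minimal, self-contained sketch of that pattern follows; the throttled_detect helper and the plain-dict cache standing in for st.session_state are illustrative only, not part of the commit.

import time

def throttled_detect(detector, frame, cache, target_fps=10):
    """Run detector.detect(frame) at most target_fps times per second.

    `cache` is a plain dict standing in for st.session_state; it keeps the
    last detections, the last inference time, and the last inference timestamp.
    """
    now = time.time()
    if now - cache.get("last_inference_timestamp", 0.0) >= 1.0 / target_fps:
        # Enough time has passed: run a fresh inference and cache the result
        detections, inference_time = detector.detect(frame)
        cache["last_detections"] = detections
        cache["last_inference_time"] = inference_time
        cache["last_inference_timestamp"] = now
    # Otherwise reuse the cached result from the previous inference
    return cache.get("last_detections", []), cache.get("last_inference_time", 0.0)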