Spaces:

SkalskiP
/

RF-DETR

Running on T4

App Files Files Community

SkalskiP committed 7 days ago

Commit

75be6c3

1 Parent(s): 1015457

test video processing on HF spaces

Browse files

Files changed (2) hide show

app.py +61 -21
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,5 +1,7 @@
-from typing import Union
 import gradio as gr
 import numpy as np
 import supervision as sv
@@ -9,22 +11,16 @@ from rfdetr.detr import RFDETR
 from rfdetr.util.coco_classes import COCO_CLASSES
 from utils.image import calculate_resolution_wh
-from utils.video import create_directory
 MARKDOWN = """
 # RF-DETR 🔥
-<div>
-  <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-rf-detr-on-detection-dataset.ipynb">
-    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="colab" style="display:inline-block;">
-  </a>
-  <a href="https://blog.roboflow.com/rf-detr">
-    <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="roboflow" style="display:inline-block;">
-  </a>
-  <a href="https://github.com/roboflow/rf-detr">
-    <img src="https://badges.aleen42.com/src/github.svg" alt="roboflow" style="display:inline-block;">
-  </a>
-</div>
 RF-DETR is a real-time, transformer-based object detection model architecture developed
 by [Roboflow](https://roboflow.com/) and released under the Apache 2.0 license.
@@ -41,12 +37,18 @@ COLOR = sv.ColorPalette.from_hex([
     "#9999ff", "#3399ff", "#66ffff", "#33ff99", "#66ff66", "#99ff00"
 ])
 VIDEO_SCALE_FACTOR = 0.5
 VIDEO_TARGET_DIRECTORY = "tmp"
 create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
-def detect_and_annotate(model: RFDETR, image: Union[Image.Image, np.ndarray], confidence: float):
     detections = model.predict(image, threshold=confidence)
     resolution_wh = calculate_resolution_wh(image)
@@ -73,16 +75,54 @@ def detect_and_annotate(model: RFDETR, image: Union[Image.Image, np.ndarray], co
     return annotated_image
-def image_processing_inference(input_image: Image.Image, confidence: float, resolution: int, checkpoint: str):
-    model_class = RFDETRBase if checkpoint == "base" else RFDETRLarge
-    model = model_class(resolution=resolution)
     return detect_and_annotate(model=model, image=input_image, confidence=confidence)
-def video_processing_inference(input_video: str, confidence: float, resolution: int, checkpoint: str):
-    model_class = RFDETRBase if checkpoint == "base" else RFDETRLarge
-    model = model_class(resolution=resolution)
-    return input_video
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)

+import os
+from typing import TypeVar
+from tqdm import tqdm
 import gradio as gr
 import numpy as np
 import supervision as sv
 from rfdetr.util.coco_classes import COCO_CLASSES
 from utils.image import calculate_resolution_wh
+from utils.video import create_directory, generate_unique_name
+ImageType = TypeVar("ImageType", Image.Image, np.ndarray)
 MARKDOWN = """
 # RF-DETR 🔥
+[`[code]`](https://github.com/roboflow/rf-detr)
+[`[blog]`](https://blog.roboflow.com/rf-detr)
+[`[notebook]`](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-rf-detr-on-detection-dataset.ipynb)
 RF-DETR is a real-time, transformer-based object detection model architecture developed
 by [Roboflow](https://roboflow.com/) and released under the Apache 2.0 license.
     "#9999ff", "#3399ff", "#66ffff", "#33ff99", "#66ff66", "#99ff00"
 ])
+MAX_VIDEO_LENGTH_SECONDS = 2
 VIDEO_SCALE_FACTOR = 0.5
 VIDEO_TARGET_DIRECTORY = "tmp"
 create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
+def detect_and_annotate(
+        model: RFDETR,
+        image: ImageType,
+        confidence: float
+) -> ImageType:
     detections = model.predict(image, threshold=confidence)
     resolution_wh = calculate_resolution_wh(image)
     return annotated_image
def load_model(resolution: int, checkpoint: str) -> RFDETR:
    """Instantiate the RF-DETR variant selected by ``checkpoint``.

    Args:
        resolution: Forwarded unchanged as the ``resolution`` keyword of the
            model constructor.
        checkpoint: Either ``"base"`` (builds ``RFDETRBase``) or ``"large"``
            (builds ``RFDETRLarge``).

    Returns:
        A freshly constructed model instance.

    Raises:
        TypeError: If ``checkpoint`` is neither ``"base"`` nor ``"large"``.
    """
    # Reject unknown names up front so the happy path is a single expression.
    if checkpoint not in ("base", "large"):
        raise TypeError("Checkpoint must be a base or large.")
    model_class = RFDETRBase if checkpoint == "base" else RFDETRLarge
    return model_class(resolution=resolution)
def image_processing_inference(
        input_image: Image.Image,
        confidence: float,
        resolution: int,
        checkpoint: str
):
    """Detect objects on a single image and return the annotated result.

    Args:
        input_image: Image to run the detector on.
        confidence: Threshold handed to ``detect_and_annotate``.
        resolution: Model input resolution, forwarded to ``load_model``.
        checkpoint: Model variant name, forwarded to ``load_model``.

    Returns:
        Whatever ``detect_and_annotate`` returns for ``input_image``.

    Raises:
        TypeError: Propagated from ``load_model`` for an unknown checkpoint.
    """
    # A new model is built on every call; nothing is cached between requests.
    return detect_and_annotate(
        model=load_model(resolution=resolution, checkpoint=checkpoint),
        image=input_image,
        confidence=confidence,
    )
def video_processing_inference(
        input_video: str,
        confidence: float,
        resolution: int,
        checkpoint: str,
        progress=gr.Progress(track_tqdm=True)
):
    """Annotate the opening frames of a video and write them to a new file.

    Frames are downscaled by ``VIDEO_SCALE_FACTOR`` before detection, and at
    most ``fps * MAX_VIDEO_LENGTH_SECONDS`` frames are processed. The result
    is saved as an ``.mp4`` file inside ``VIDEO_TARGET_DIRECTORY``.

    Args:
        input_video: Path of the source video.
        confidence: Threshold handed to ``detect_and_annotate``.
        resolution: Model input resolution, forwarded to ``load_model``.
        checkpoint: Model variant name, forwarded to ``load_model``.
        progress: Not referenced in the body; presumably its presence lets
            Gradio surface the tqdm loop as a progress bar (confirm against
            the Gradio docs).

    Returns:
        Path of the written output video.

    Raises:
        TypeError: Propagated from ``load_model`` for an unknown checkpoint.
    """
    detector = load_model(resolution=resolution, checkpoint=checkpoint)

    file_stem = generate_unique_name()
    target_path = os.path.join(VIDEO_TARGET_DIRECTORY, f"{file_stem}.mp4")

    # The same info object is reused for the sink, so shrink its dimensions
    # by the factor that is applied to every frame below.
    sink_info = sv.VideoInfo.from_video_path(input_video)
    sink_info.width = int(sink_info.width * VIDEO_SCALE_FACTOR)
    sink_info.height = int(sink_info.height * VIDEO_SCALE_FACTOR)

    # Cap the work at MAX_VIDEO_LENGTH_SECONDS worth of frames.
    frame_limit = min(
        sink_info.total_frames,
        sink_info.fps * MAX_VIDEO_LENGTH_SECONDS,
    )
    source_frames = sv.get_video_frames_generator(input_video, end=frame_limit)

    with sv.VideoSink(target_path, video_info=sink_info) as sink:
        for source_frame in tqdm(source_frames, total=frame_limit):
            scaled_frame = sv.scale_image(source_frame, VIDEO_SCALE_FACTOR)
            result_frame = detect_and_annotate(
                model=detector,
                image=scaled_frame,
                confidence=confidence,
            )
            sink.write_frame(result_frame)

    return target_path
 with gr.Blocks() as demo:
     gr.Markdown(MARKDOWN)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 gradio
 spaces
-rfdetr

 gradio
 spaces
+rfdetr
+tqdm