Update groundingdino/util/inference.py

groundingdino/util/inference.py  (+21 -58)
--- a/groundingdino/util/inference.py
+++ b/groundingdino/util/inference.py
@@ -5,7 +5,6 @@ import numpy as np
 import supervision as sv
 import torch
 from PIL import Image
-from torchvision.ops import box_convert
 import bisect
 
 import groundingdino.datasets.transforms as T
@@ -14,6 +13,19 @@ from groundingdino.util.misc import clean_state_dict
 from groundingdino.util.slconfig import SLConfig
 from groundingdino.util.utils import get_phrases_from_posmap
 
+
+def cxcywh_to_xyxy(boxes: torch.Tensor) -> torch.Tensor:
+    """
+    Convert bounding boxes from [cx, cy, w, h] format to [x1, y1, x2, y2] format.
+    """
+    cx, cy, w, h = boxes.unbind(-1)
+    x1 = cx - 0.5 * w
+    y1 = cy - 0.5 * h
+    x2 = cx + 0.5 * w
+    y2 = cy + 0.5 * h
+    return torch.stack((x1, y1, x2, y2), dim=-1)
+
+
 # ----------------------------------------------------------------------------------------------------------------------
 # OLD API
 # ----------------------------------------------------------------------------------------------------------------------
@@ -67,16 +79,16 @@ def predict(
     with torch.no_grad():
         outputs = model(image[None], captions=[caption])
 
-    prediction_logits = outputs["pred_logits"].cpu().sigmoid()[0]
-    prediction_boxes = outputs["pred_boxes"].cpu()[0]
+    prediction_logits = outputs["pred_logits"].cpu().sigmoid()[0]
+    prediction_boxes = outputs["pred_boxes"].cpu()[0]
 
     mask = prediction_logits.max(dim=1)[0] > box_threshold
-    logits = prediction_logits[mask]
-    boxes = prediction_boxes[mask]
+    logits = prediction_logits[mask]
+    boxes = prediction_boxes[mask]
 
     tokenizer = model.tokenizer
     tokenized = tokenizer(caption)
-
+
     if remove_combined:
         sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
 
@@ -98,21 +110,9 @@ def predict(
 
 
 def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray:
-    """
-    This function annotates an image with bounding boxes and labels.
-
-    Parameters:
-    image_source (np.ndarray): The source image to be annotated.
-    boxes (torch.Tensor): A tensor containing bounding box coordinates.
-    logits (torch.Tensor): A tensor containing confidence scores for each bounding box.
-    phrases (List[str]): A list of labels for each bounding box.
-
-    Returns:
-    np.ndarray: The annotated image.
-    """
     h, w, _ = image_source.shape
     boxes = boxes * torch.Tensor([w, h, w, h])
-    xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
+    xyxy = cxcywh_to_xyxy(boxes).numpy()
     detections = sv.Detections(xyxy=xyxy)
 
     labels = [
@@ -156,24 +156,6 @@ class Model:
         box_threshold: float = 0.35,
         text_threshold: float = 0.25
     ) -> Tuple[sv.Detections, List[str]]:
-        """
-        import cv2
-
-        image = cv2.imread(IMAGE_PATH)
-
-        model = Model(model_config_path=CONFIG_PATH, model_checkpoint_path=WEIGHTS_PATH)
-        detections, labels = model.predict_with_caption(
-            image=image,
-            caption=caption,
-            box_threshold=BOX_THRESHOLD,
-            text_threshold=TEXT_THRESHOLD
-        )
-
-        import supervision as sv
-
-        box_annotator = sv.BoxAnnotator()
-        annotated_image = box_annotator.annotate(scene=image, detections=detections, labels=labels)
-        """
         processed_image = Model.preprocess_image(image_bgr=image).to(self.device)
         boxes, logits, phrases = predict(
             model=self.model,
@@ -197,25 +179,6 @@ class Model:
         box_threshold: float,
         text_threshold: float
     ) -> sv.Detections:
-        """
-        import cv2
-
-        image = cv2.imread(IMAGE_PATH)
-
-        model = Model(model_config_path=CONFIG_PATH, model_checkpoint_path=WEIGHTS_PATH)
-        detections = model.predict_with_classes(
-            image=image,
-            classes=CLASSES,
-            box_threshold=BOX_THRESHOLD,
-            text_threshold=TEXT_THRESHOLD
-        )
-
-
-        import supervision as sv
-
-        box_annotator = sv.BoxAnnotator()
-        annotated_image = box_annotator.annotate(scene=image, detections=detections)
-        """
         caption = ". ".join(classes)
         processed_image = Model.preprocess_image(image_bgr=image).to(self.device)
         boxes, logits, phrases = predict(
@@ -256,7 +219,7 @@ class Model:
         logits: torch.Tensor
     ) -> sv.Detections:
         boxes = boxes * torch.Tensor([source_w, source_h, source_w, source_h])
-        xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
+        xyxy = cxcywh_to_xyxy(boxes).numpy()
         confidence = logits.numpy()
         return sv.Detections(xyxy=xyxy, confidence=confidence)
 
@@ -270,4 +233,4 @@ class Model:
                     break
             else:
                 class_ids.append(None)
-        return np.array(class_ids)
+        return np.array(class_ids)
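For context, the new cxcywh_to_xyxy helper is used as a drop-in replacement for the torchvision.ops.box_convert call this commit removes, and it feeds the same scaled pixel boxes into supervision's Detections in annotate() and Model.post_process_result(). A minimal sanity-check sketch (not part of the commit; it assumes torchvision is still installed for the comparison, that the patched groundingdino.util.inference module is importable, and the 640x480 image size is hypothetical):

import torch
from torchvision.ops import box_convert  # only needed for the comparison below

from groundingdino.util.inference import cxcywh_to_xyxy

# Normalized [cx, cy, w, h] boxes, as returned by predict().
boxes = torch.tensor([[0.50, 0.50, 0.20, 0.40],
                      [0.30, 0.70, 0.10, 0.10]])

# 1. The helper should reproduce torchvision's cxcywh -> xyxy conversion.
reference = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy")
assert torch.allclose(cxcywh_to_xyxy(boxes), reference)

# 2. The path used in annotate()/post_process_result(): scale to pixel
#    coordinates first, then convert. For a hypothetical 640x480 image,
#    the first box becomes [256., 144., 384., 336.].
h, w = 480, 640
xyxy = cxcywh_to_xyxy(boxes * torch.Tensor([w, h, w, h])).numpy()

Keeping the conversion local is what lets the import of torchvision.ops.box_convert be dropped at the top of the diff; the downstream supervision calls are unchanged.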