qqc1989 committed 16 days ago

Commit

ed861ec

verified ·

1 Parent(s): c694ad9

Upload 114 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
python/axmodels/feature_extractor.axmodel +3 -0
python/axmodels/motion_extractor.axmodel +3 -0
python/axmodels/spade_generator.axmodel +3 -0
python/axmodels/stitching_retargeting.axmodel +3 -0
python/axmodels/warp.onnx +3 -0
python/cropper.py +242 -0
python/infer.py +894 -0
python/infer_onnx.py +952 -0
python/requirements.txt +9 -0
python/utils/__init__.py +0 -0
python/utils/__pycache__/__init__.cpython-310.pyc +0 -0
python/utils/__pycache__/crop.cpython-310.pyc +0 -0
python/utils/__pycache__/human_landmark_runner.cpython-310.pyc +0 -0
python/utils/__pycache__/rprint.cpython-310.pyc +0 -0
python/utils/__pycache__/timer.cpython-310.pyc +0 -0
python/utils/crop.py +423 -0
python/utils/dependencies/XPose/config_model/UniPose_SwinT.py +125 -0
python/utils/dependencies/XPose/config_model/coco_transformer.py +8 -0
python/utils/dependencies/XPose/models/UniPose/__init__.py +10 -0
python/utils/dependencies/XPose/models/UniPose/attention.py +373 -0
python/utils/dependencies/XPose/models/UniPose/backbone.py +211 -0
python/utils/dependencies/XPose/models/UniPose/deformable_transformer.py +1230 -0
python/utils/dependencies/XPose/models/UniPose/fuse_modules.py +274 -0
python/utils/dependencies/XPose/models/UniPose/mask_generate.py +56 -0
python/utils/dependencies/XPose/models/UniPose/ops/functions/__init__.py +10 -0
python/utils/dependencies/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py +61 -0
python/utils/dependencies/XPose/models/UniPose/ops/modules/__init__.py +9 -0
python/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn.py +142 -0
python/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py +130 -0
python/utils/dependencies/XPose/models/UniPose/ops/setup.py +73 -0
python/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp +41 -0
python/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h +33 -0
python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu +153 -0
python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h +30 -0
python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh +1327 -0
python/utils/dependencies/XPose/models/UniPose/ops/src/ms_deform_attn.h +62 -0
python/utils/dependencies/XPose/models/UniPose/ops/src/vision.cpp +16 -0
python/utils/dependencies/XPose/models/UniPose/ops/test.py +89 -0
python/utils/dependencies/XPose/models/UniPose/position_encoding.py +157 -0
python/utils/dependencies/XPose/models/UniPose/swin_transformer.py +701 -0
python/utils/dependencies/XPose/models/UniPose/transformer_deformable.py +595 -0
python/utils/dependencies/XPose/models/UniPose/transformer_vanilla.py +102 -0
python/utils/dependencies/XPose/models/UniPose/unipose.py +621 -0
python/utils/dependencies/XPose/models/UniPose/utils.py +348 -0
python/utils/dependencies/XPose/models/__init__.py +16 -0
python/utils/dependencies/XPose/models/registry.py +58 -0
python/utils/dependencies/XPose/predefined_keypoints.py +56 -0
python/utils/dependencies/XPose/transforms.py +394 -0
python/utils/dependencies/XPose/util/addict.py +159 -0

.gitattributes CHANGED Viewed

@@ -52,3 +52,4 @@ assets/examples/source/s5.jpg filter=lfs diff=lfs merge=lfs -text
 assets/examples/source/s6.jpg filter=lfs diff=lfs merge=lfs -text
 assets/examples/source/s7.jpg filter=lfs diff=lfs merge=lfs -text
 assets/examples/source/s9.jpg filter=lfs diff=lfs merge=lfs -text

 assets/examples/source/s6.jpg filter=lfs diff=lfs merge=lfs -text
 assets/examples/source/s7.jpg filter=lfs diff=lfs merge=lfs -text
 assets/examples/source/s9.jpg filter=lfs diff=lfs merge=lfs -text
+python/utils/dependencies/insightface/data/images/t1.jpg filter=lfs diff=lfs merge=lfs -text

python/axmodels/feature_extractor.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79f688c174b4ff91ccd1b0a0869e2cad4ff962f914edb74a5c0a26a2d540cee9
+size 1543019

python/axmodels/motion_extractor.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:10dfe954ba0c9d1ab31997ed8c142a7336fba58c8344635b0d91d0c6a0eae341
+size 38150196

python/axmodels/spade_generator.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5a097161c26b85dee1d4bafbcbdd1097a9b843414fae9c62a505221a37cc793
+size 63354167

python/axmodels/stitching_retargeting.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12bf1d10463da83ad90cbfabfc58002127bf39432436a105213a294147191cd7
+size 60571

python/axmodels/warp.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:160c613a5c8fc49d0a8aca1eacfa208642b673fa5b25e08853ac40548106b53c
+size 201167246

python/cropper.py ADDED Viewed

	@@ -0,0 +1,242 @@

+from utils.dependencies.insightface.app import FaceAnalysis
+from utils.dependencies.insightface.app.common import Face
+from utils.timer import Timer
+from utils.human_landmark_runner import LandmarkRunner as HumanLandmark
+from utils.crop import crop_image
+from typing import List, Tuple, Union
+from dataclasses import dataclass, field
+import numpy as np
+import os.path as osp
+import cv2
+def contiguous(obj):
+    if not obj.flags.c_contiguous:
+        obj = obj.copy(order="C")
+    return obj
+@dataclass
+class Trajectory:
+    """Tracking record for one clip: a frame range plus per-frame lists that are filled as frames are processed."""
+    start: int = -1  # index of the first tracked frame; -1 means tracking has not started yet
+    end: int = -1  # index of the most recently tracked frame
+    lmk_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # one landmark result per tracked frame
+    bbox_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # bbox list (not written by the code visible here)
+    M_c2o_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # M_c2o list; presumably crop-to-original transforms — TODO confirm
+    frame_rgb_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # frame list
+    lmk_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # lmk list; presumably landmarks in crop coordinates — TODO confirm
+    frame_rgb_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)  # frame crop list
+def make_abs_path(fn):
+    return osp.join(osp.dirname(osp.realpath(__file__)), fn)
+def sort_by_direction(faces, direction: str = 'large-small', face_center=None):
+    if len(faces) <= 0:
+        return faces
+    if direction == 'left-right':
+        return sorted(faces, key=lambda face: face['bbox'][0])
+    if direction == 'right-left':
+        return sorted(faces, key=lambda face: face['bbox'][0], reverse=True)
+    if direction == 'top-bottom':
+        return sorted(faces, key=lambda face: face['bbox'][1])
+    if direction == 'bottom-top':
+        return sorted(faces, key=lambda face: face['bbox'][1], reverse=True)
+    if direction == 'small-large':
+        return sorted(faces, key=lambda face: (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1]))
+    if direction == 'large-small':
+        return sorted(faces, key=lambda face: (face['bbox'][2] - face['bbox'][0]) * (face['bbox'][3] - face['bbox'][1]), reverse=True)
+    if direction == 'distance-from-retarget-face':
+        return sorted(faces, key=lambda face: (((face['bbox'][2]+face['bbox'][0])/2-face_center[0])**2+((face['bbox'][3]+face['bbox'][1])/2-face_center[1])**2)**0.5)
+    return faces
+class FaceAnalysisDIY(FaceAnalysis):
+    def __init__(self, name='buffalo_l', root='~/.insightface', allowed_modules=None, **kwargs):
+        super().__init__(name=name, root=root, allowed_modules=allowed_modules, **kwargs)
+        self.timer = Timer()
+    def get(self, img_bgr, **kwargs):
+        max_num = kwargs.get('max_face_num', 0)  # the number of the detected faces, 0 means no limit
+        flag_do_landmark_2d_106 = kwargs.get('flag_do_landmark_2d_106', True)  # whether to do 106-point detection
+        direction = kwargs.get('direction', 'large-small')  # sorting direction
+        face_center = None
+        bboxes, kpss = self.det_model.detect(img_bgr, max_num=max_num, metric='default')
+        if bboxes.shape[0] == 0:
+            return []
+        ret = []
+        for i in range(bboxes.shape[0]):
+            bbox = bboxes[i, 0:4]
+            det_score = bboxes[i, 4]
+            kps = None
+            if kpss is not None:
+                kps = kpss[i]
+            face = Face(bbox=bbox, kps=kps, det_score=det_score)
+            for taskname, model in self.models.items():
+                if taskname == 'detection':
+                    continue
+                if (not flag_do_landmark_2d_106) and taskname == 'landmark_2d_106':
+                    continue
+                # print(f'taskname: {taskname}')
+                model.get(img_bgr, face)
+            ret.append(face)
+        ret = sort_by_direction(ret, direction, face_center)
+        return ret
+    def warmup(self):
+        self.timer.tic()
+        img_bgr = np.zeros((512, 512, 3), dtype=np.uint8)
+        self.get(img_bgr)
+        elapse = self.timer.toc()
+        print(f'FaceAnalysisDIY warmup time: {elapse:.3f}s')
+class Cropper(object):
+    def __init__(self, ):
+        self.face_analysis_wrapper_provider = ["CPUExecutionProvider"]
+        self.insightface_root: str = make_abs_path("./pretrained_weights/insightface")
+        self.device_id = 0
+        self.landmark_ckpt_path: str = make_abs_path("./pretrained_weights/liveportrait/landmark.onnx")
+        self.det_thresh: float = 0.1 # detection threshold
+        self.device = "cpu"
+        self.image_type = "human_face"
+        self.direction: str = "large-small"  # direction of cropping
+        self.max_face_num: int = 0  # max face number, 0 mean no limit
+        self.dsize: int = 512  # crop size
+        self.scale: float = 2.3  # scale factor
+        self.vx_ratio: float = 0  # vx ratio
+        self.vy_ratio: float = -0.125  # vy ratio +up, -down
+        self.flag_do_rot: bool = True # whether to conduct the rotation when flag_do_crop is True
+        self.face_analysis_wrapper = FaceAnalysisDIY(
+            name="buffalo_l",
+            root=self.insightface_root,
+            providers=self.face_analysis_wrapper_provider,
+        )
+        self.face_analysis_wrapper.prepare(ctx_id=self.device_id, det_size=(512, 512), det_thresh=self.det_thresh)
+        self.face_analysis_wrapper.warmup()
+        self.human_landmark_runner = HumanLandmark(
+            ckpt_path=self.landmark_ckpt_path,
+            onnx_provider=self.device,
+            device_id=self.device_id,
+        )
+        self.human_landmark_runner.warmup()
+    def crop_source_image(self, img_rgb_: np.ndarray):
+        # crop a source image and get neccessary information
+        img_rgb = img_rgb_.copy()  # copy it
+        img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
+        if self.image_type == "human_face":
+            src_face = self.face_analysis_wrapper.get(
+                img_bgr,
+                flag_do_landmark_2d_106=True,
+                direction=self.direction,
+                max_face_num=self.max_face_num,
+            )
+            if len(src_face) == 0:
+                log("No face detected in the source image.")
+                return None
+            elif len(src_face) > 1:
+                log(f"More than one face detected in the image, only pick one face by rule {self.direction}.")
+            # NOTE: temporarily only pick the first face, to support multiple face in the future
+            src_face = src_face[0]
+            lmk = src_face.landmark_2d_106  # this is the 106 landmarks from insightface
+        else:
+            tmp_dct = {
+                'animal_face_9': 'animal_face',
+                'animal_face_68': 'face'
+            }
+            img_rgb_pil = Image.fromarray(img_rgb)
+            lmk = self.animal_landmark_runner.run(
+                img_rgb_pil,
+                'face',
+                tmp_dct[self.animal_face_type],
+                0,
+                0
+            )
+        # crop the face
+        ret_dct = crop_image(
+            img_rgb,  # ndarray
+            lmk,  # 106x2 or Nx2
+            dsize=self.dsize,
+            scale=self.scale,
+            vx_ratio=self.vx_ratio,
+            vy_ratio=self.vy_ratio,
+            flag_do_rot=self.flag_do_rot,
+        )
+        # update a 256x256 version for network input
+        ret_dct["img_crop_256x256"] = cv2.resize(ret_dct["img_crop"], (256, 256), interpolation=cv2.INTER_AREA)
+        cv2.imwrite("/data/tmp/yongqiang/LLM/projects/zr/liveportrait_onnx/img_crop.jpg", cv2.cvtColor(ret_dct["img_crop"], cv2.COLOR_BGR2RGB))
+        cv2.imwrite("/data/tmp/yongqiang/LLM/projects/zr/liveportrait_onnx/img_crop_256x256.jpg", cv2.cvtColor(ret_dct["img_crop_256x256"], cv2.COLOR_BGR2RGB))
+        if self.image_type == "human_face":
+            lmk = self.human_landmark_runner.run(img_rgb, lmk)
+            ret_dct["lmk_crop"] = lmk
+            ret_dct["lmk_crop_256x256"] = ret_dct["lmk_crop"] * 256 / self.dsize
+        else:
+            # 68x2 or 9x2
+            ret_dct["lmk_crop"] = lmk
+        return ret_dct
+    def calc_lmk_from_cropped_image(self, img_rgb_, **kwargs):
+        direction = kwargs.get("direction", "large-small")
+        src_face = self.face_analysis_wrapper.get(
+            contiguous(img_rgb_[..., ::-1]),  # convert to BGR
+            flag_do_landmark_2d_106=True,
+            direction=direction,
+        )
+        if len(src_face) == 0:
+            log("No face detected in the source image.")
+            return None
+        elif len(src_face) > 1:
+            log(f"More than one face detected in the image, only pick one face by rule {direction}.")
+        src_face = src_face[0]
+        lmk = src_face.landmark_2d_106
+        lmk = self.human_landmark_runner.run(img_rgb_, lmk)
+        return lmk
+    def calc_lmks_from_cropped_video(self, driving_rgb_crop_lst, **kwargs):
+        """Tracking based landmarks/alignment"""
+        trajectory = Trajectory()
+        direction = kwargs.get("direction", "large-small")
+        for idx, frame_rgb_crop in enumerate(driving_rgb_crop_lst):
+            if idx == 0 or trajectory.start == -1:
+                src_face = self.face_analysis_wrapper.get(
+                    contiguous(frame_rgb_crop[..., ::-1]),  # convert to BGR
+                    flag_do_landmark_2d_106=True,
+                    direction=direction,
+                )
+                if len(src_face) == 0:
+                    log(f"No face detected in the frame #{idx}")
+                    raise Exception(f"No face detected in the frame #{idx}")
+                elif len(src_face) > 1:
+                    log(f"More than one face detected in the driving frame_{idx}, only pick one face by rule {direction}.")
+                src_face = src_face[0]
+                lmk = src_face.landmark_2d_106
+                lmk = self.human_landmark_runner.run(frame_rgb_crop, lmk)
+                trajectory.start, trajectory.end = idx, idx
+            else:
+                lmk = self.human_landmark_runner.run(frame_rgb_crop, trajectory.lmk_lst[-1])
+                trajectory.end = idx
+            trajectory.lmk_lst.append(lmk)
+        return trajectory.lmk_lst

python/infer.py ADDED Viewed

	@@ -0,0 +1,894 @@

+import argparse
+import cv2
+import numpy as np
+import os
+import onnxruntime as ort
+from axengine import InferenceSession
+import numpy as np
+import cv2
+import argparse
+import os.path as osp
+from loguru import logger
+from numpy import ndarray
+import pickle as pkl
+import torch
+import torch.nn.functional as F
+from cropper import Cropper
+import imageio
+import subprocess
+from utils.timer import Timer
+from typing import Union
+from scipy.spatial import ConvexHull # pylint: disable=E0401,E0611
+# Module-level handles for the five pipeline models, all None at import time. get_kp_info calls
+# motion_extractor.run(...), so they are expected to be assigned before inference; the assignment
+# is not visible in this part of the file.
+appearance_feature_extractor, motion_extractor, warping_module, spade_generator, stitching_retargeting_module = None, None, None, None, None
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="LivePortrait",
+        description="LivePortrait: A Real-time 3D Live Portrait Animation System"
+    )
+    parser.add_argument(
+        "--source",
+        type=str,
+        required=True,
+        help="Path to source image.",
+    )
+    parser.add_argument(
+        "--driving",
+        type=str,
+        required=True,
+        help="Path to driving image.",
+    )
+    parser.add_argument(
+        "--models",
+        type=str,
+        required=True,
+        help="Path to onnx models.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="./output",
+        help="Path to infer results.",
+    )
+    return parser.parse_args()
+def images2video(images, wfp, **kwargs):
+    fps = kwargs.get('fps', 30)
+    video_format = kwargs.get('format', 'mp4')  # default is mp4 format
+    codec = kwargs.get('codec', 'libx264')  # default is libx264 encoding
+    quality = kwargs.get('quality')  # video quality
+    pixelformat = kwargs.get('pixelformat', 'yuv420p')  # video pixel format
+    image_mode = kwargs.get('image_mode', 'rgb')
+    macro_block_size = kwargs.get('macro_block_size', 2)
+    ffmpeg_params = ['-crf', str(kwargs.get('crf', 18))]
+    writer = imageio.get_writer(
+        wfp, fps=fps, format=video_format,
+        codec=codec, quality=quality, ffmpeg_params=ffmpeg_params, pixelformat=pixelformat, macro_block_size=macro_block_size
+    )
+    n = len(images)
+    for i in range(n):
+        if image_mode.lower() == 'bgr':
+            writer.append_data(images[i][..., ::-1])
+        else:
+            writer.append_data(images[i])
+    writer.close()
+def has_audio_stream(video_path: str) -> bool:
+    """
+    Check if the video file contains an audio stream.
+    :param video_path: Path to the video file
+    :return: True if the video contains an audio stream, False otherwise
+    """
+    if osp.isdir(video_path):
+        return False
+    cmd = [
+        'ffprobe',
+        '-v', 'error',
+        '-select_streams', 'a',
+        '-show_entries', 'stream=codec_type',
+        '-of', 'default=noprint_wrappers=1:nokey=1',
+        f'"{video_path}"'
+    ]
+    try:
+        # result = subprocess.run(cmd, capture_output=True, text=True)
+        result = exec_cmd(' '.join(cmd))
+        if result.returncode != 0:
+            logger.info(f"Error occurred while probing video: {result.stderr}")
+            return False
+        # Check if there is any output from ffprobe command
+        return bool(result.stdout.strip())
+    except Exception as e:
+        logger.info(
+            f"Error occurred while probing video: {video_path}, "
+            "you may need to install ffprobe! (https://ffmpeg.org/download.html) "
+            "Now set audio to false!",
+            style="bold red"
+        )
+    return False
+def tensor_to_numpy(data: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
+    """transform torch.Tensor into numpy.ndarray"""
+    if isinstance(data, torch.Tensor):
+        return data.data.cpu().numpy()
+    return data
+def calc_motion_multiplier(
+    kp_source: Union[np.ndarray, torch.Tensor],
+    kp_driving_initial: Union[np.ndarray, torch.Tensor]
+) -> float:
+    """calculate motion_multiplier based on the source image and the first driving frame"""
+    kp_source_np = tensor_to_numpy(kp_source)
+    kp_driving_initial_np = tensor_to_numpy(kp_driving_initial)
+    source_area = ConvexHull(kp_source_np.squeeze(0)).volume
+    driving_area = ConvexHull(kp_driving_initial_np.squeeze(0)).volume
+    motion_multiplier = np.sqrt(source_area) / np.sqrt(driving_area)
+    # motion_multiplier = np.cbrt(source_area) / np.cbrt(driving_area)
+    return motion_multiplier
+def load_video(video_info, n_frames=-1):
+    reader = imageio.get_reader(video_info, "ffmpeg")
+    ret = []
+    for idx, frame_rgb in enumerate(reader):
+        if n_frames > 0 and idx >= n_frames:
+            break
+        ret.append(frame_rgb)
+    reader.close()
+    return ret
+def fast_check_ffmpeg():
+    try:
+        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
+        return True
+    except:
+        return False
+def is_video(file_path):
+    if file_path.lower().endswith((".mp4", ".mov", ".avi", ".webm")) or osp.isdir(file_path):
+        return True
+    return False
+def is_image(file_path):
+    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp')
+    return file_path.lower().endswith(image_extensions)
+def get_fps(filepath, default_fps=25):
+    try:
+        fps = cv2.VideoCapture(filepath).get(cv2.CAP_PROP_FPS)
+        if fps in (0, None):
+            fps = default_fps
+    except Exception as e:
+        logger.info(e)
+        fps = default_fps
+    return fps
+def calculate_distance_ratio(lmk: np.ndarray, idx1: int, idx2: int, idx3: int, idx4: int, eps: float = 1e-6) -> np.ndarray:
+    return (np.linalg.norm(lmk[:, idx1] - lmk[:, idx2], axis=1, keepdims=True) /
+            (np.linalg.norm(lmk[:, idx3] - lmk[:, idx4], axis=1, keepdims=True) + eps))
+def calc_eye_close_ratio(lmk: np.ndarray, target_eye_ratio: np.ndarray = None) -> np.ndarray:
+    lefteye_close_ratio = calculate_distance_ratio(lmk, 6, 18, 0, 12)
+    righteye_close_ratio = calculate_distance_ratio(lmk, 30, 42, 24, 36)
+    if target_eye_ratio is not None:
+        return np.concatenate([lefteye_close_ratio, righteye_close_ratio, target_eye_ratio], axis=1)
+    else:
+        return np.concatenate([lefteye_close_ratio, righteye_close_ratio], axis=1)
+def calc_lip_close_ratio(lmk: np.ndarray) -> np.ndarray:
+    return calculate_distance_ratio(lmk, 90, 102, 48, 66)
+def concat_frames(driving_image_lst, source_image_lst, I_p_lst):
+    # TODO: add more concat style, e.g., left-down corner driving
+    out_lst = []
+    h, w, _ = I_p_lst[0].shape
+    source_image_resized_lst = [cv2.resize(img, (w, h)) for img in source_image_lst]
+    for idx, _ in enumerate(I_p_lst):
+        I_p = I_p_lst[idx]
+        source_image_resized = source_image_resized_lst[idx] if len(source_image_lst) > 1 else source_image_resized_lst[0]
+        if driving_image_lst is None:
+            out = np.hstack((source_image_resized, I_p))
+        else:
+            driving_image = driving_image_lst[idx]
+            driving_image_resized = cv2.resize(driving_image, (w, h))
+            out = np.hstack((driving_image_resized, source_image_resized, I_p))
+        out_lst.append(out)
+    return out_lst
+def concat_feat(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """
+    kp_source: (bs, k, 3)
+    kp_driving: (bs, k, 3)
+    Return: (bs, 2k*3)
+    """
+    bs_src = kp_source.shape[0]
+    bs_dri = kp_driving.shape[0]
+    assert bs_src == bs_dri, 'batch size must be equal'
+    feat = torch.cat([kp_source.view(bs_src, -1), kp_driving.view(bs_dri, -1)], dim=1)
+    return feat
+DTYPE = np.float32  # float dtype constant; not referenced by the code visible in this part of the file
+CV2_INTERP = cv2.INTER_LINEAR  # default interpolation flag of _transform_img
+def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
+    """ conduct similarity or affine transformation to the image, do not do border operation!
+    img:
+    M: 2x3 matrix or 3x3 matrix
+    dsize: target shape (width, height)
+    """
+    if isinstance(dsize, tuple) or isinstance(dsize, list):
+        _dsize = tuple(dsize)
+    else:
+        _dsize = (dsize, dsize)
+    if borderMode is not None:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags, borderMode=borderMode, borderValue=(0, 0, 0))
+    else:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags)
+def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
+    """prepare mask for later image paste back
+    """
+    mask_ori = _transform_img(mask_crop, crop_M_c2o, dsize)
+    mask_ori = mask_ori.astype(np.float32) / 255.
+    return mask_ori
+def paste_back(img_crop, M_c2o, img_ori, mask_ori):
+    """paste back the image
+    """
+    dsize = (img_ori.shape[1], img_ori.shape[0])
+    result = _transform_img(img_crop, M_c2o, dsize=dsize)
+    result = np.clip(mask_ori * result + (1 - mask_ori) * img_ori, 0, 255).astype(np.uint8)
+    return result
+def prefix(filename):
+    """a.jpg -> a"""
+    pos = filename.rfind(".")
+    if pos == -1:
+        return filename
+    return filename[:pos]
+def basename(filename):
+    """a/b/c.jpg -> c"""
+    return prefix(osp.basename(filename))
+def mkdir(d, log=False):
+    # return self-assined `d`, for one line code
+    if not osp.exists(d):
+        os.makedirs(d, exist_ok=True)
+        if log:
+            logger.info(f"Make dir: {d}")
+    return d
+def dct2device(dct: dict, device):
+    for key in dct:
+        if isinstance(dct[key], torch.Tensor):
+            dct[key] = dct[key].to(device)
+        else:
+            dct[key] = torch.tensor(dct[key]).to(device)
+    return dct
+PI = np.pi  # used by get_rotation_matrix for the degree -> radian conversion
+def headpose_pred_to_degree(pred):
+    """
+    pred: (bs, 66) or (bs, 1) or others
+    """
+    if pred.ndim > 1 and pred.shape[1] == 66:
+        # NOTE: note that the average is modified to 97.5
+        device = pred.device
+        idx_tensor = [idx for idx in range(0, 66)]
+        idx_tensor = torch.FloatTensor(idx_tensor).to(device)
+        pred = F.softmax(pred, dim=1)
+        degree = torch.sum(pred*idx_tensor, axis=1) * 3 - 97.5
+        return degree
+    return pred
+def get_rotation_matrix(pitch_, yaw_, roll_):
+    """ the input is in degree
+    """
+    # transform to radian
+    pitch = pitch_ / 180 * PI
+    yaw = yaw_ / 180 * PI
+    roll = roll_ / 180 * PI
+    device = pitch.device
+    if pitch.ndim == 1:
+        pitch = pitch.unsqueeze(1)
+    if yaw.ndim == 1:
+        yaw = yaw.unsqueeze(1)
+    if roll.ndim == 1:
+        roll = roll.unsqueeze(1)
+    # calculate the euler matrix
+    bs = pitch.shape[0]
+    ones = torch.ones([bs, 1]).to(device)
+    zeros = torch.zeros([bs, 1]).to(device)
+    x, y, z = pitch, yaw, roll
+    rot_x = torch.cat([
+        ones, zeros, zeros,
+        zeros, torch.cos(x), -torch.sin(x),
+        zeros, torch.sin(x), torch.cos(x)
+    ], dim=1).reshape([bs, 3, 3])
+    rot_y = torch.cat([
+        torch.cos(y), zeros, torch.sin(y),
+        zeros, ones, zeros,
+        -torch.sin(y), zeros, torch.cos(y)
+    ], dim=1).reshape([bs, 3, 3])
+    rot_z = torch.cat([
+        torch.cos(z), -torch.sin(z), zeros,
+        torch.sin(z), torch.cos(z), zeros,
+        zeros, zeros, ones
+    ], dim=1).reshape([bs, 3, 3])
+    rot = rot_z @ rot_y @ rot_x
+    return rot.permute(0, 2, 1)  # transpose
+def make_abs_path(fn):
+    return osp.join(osp.dirname(osp.realpath(__file__)), fn)
+def load_image_rgb(image_path: str):
+    if not osp.exists(image_path):
+        raise FileNotFoundError(f"Image not found: {image_path}")
+    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
+    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+def resize_to_limit(img: np.ndarray, max_dim=1920, division=2):
+    """
+    ajust the size of the image so that the maximum dimension does not exceed max_dim, and the width and the height of the image are multiples of n.
+    :param img: the image to be processed.
+    :param max_dim: the maximum dimension constraint.
+    :param n: the number that needs to be multiples of.
+    :return: the adjusted image.
+    """
+    h, w = img.shape[:2]
+    # ajust the size of the image according to the maximum dimension
+    if max_dim > 0 and max(h, w) > max_dim:
+        if h > w:
+            new_h = max_dim
+            new_w = int(w * (max_dim / h))
+        else:
+            new_w = max_dim
+            new_h = int(h * (max_dim / w))
+        img = cv2.resize(img, (new_w, new_h))
+    # ensure that the image dimensions are multiples of n
+    division = max(division, 1)
+    new_h = img.shape[0] - (img.shape[0] % division)
+    new_w = img.shape[1] - (img.shape[1] % division)
+    if new_h == 0 or new_w == 0:
+        # when the width or height is less than n, no need to process
+        return img
+    if new_h != img.shape[0] or new_w != img.shape[1]:
+        img = img[:new_h, :new_w]
+    return img
+def preprocess(input_data):
+    img_rgb = load_image_rgb(input_data)
+    img_rgb = resize_to_limit(img_rgb)
+    return [img_rgb]
+def postprocess(output_data):
+    # Implement your postprocessing steps here
+    # For example, you might convert the output to a specific format
+    return output_data
+def infer(model, input_data):
+    input_name = model.get_inputs()[0].name
+    output_name = model.get_outputs()[0].name
+    input_data = preprocess(input_data) # rgb, resize & limit
+    result = model.run([output_name], {input_name: input_data})
+    return postprocess(result)
+def partial_fields(target_class, kwargs):
+    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})
+def calc_ratio(lmk_lst):
+    input_eye_ratio_lst = []
+    input_lip_ratio_lst = []
+    for lmk in lmk_lst:
+        # for eyes retargeting
+        input_eye_ratio_lst.append(calc_eye_close_ratio(lmk[None]))
+        # for lip retargeting
+        input_lip_ratio_lst.append(calc_lip_close_ratio(lmk[None]))
+    return input_eye_ratio_lst, input_lip_ratio_lst
+def prepare_videos(imgs) -> torch.Tensor:
+    """ construct the input as standard
+    imgs: NxBxHxWx3, uint8
+    """
+    device = "cpu"
+    if isinstance(imgs, list):
+        _imgs = np.array(imgs)[..., np.newaxis]  # TxHxWx3x1
+    elif isinstance(imgs, np.ndarray):
+        _imgs = imgs
+    else:
+        raise ValueError(f'imgs type error: {type(imgs)}')
+    y = _imgs.astype(np.float32) / 255.
+    y = np.clip(y, 0, 1)  # clip to 0~1
+    y = torch.from_numpy(y).permute(0, 4, 3, 1, 2)  # TxHxWx3x1 -> Tx1x3xHxW
+    y = y.to(device)
+    return y
+def get_kp_info(x: torch.Tensor) -> dict:
+    """ get the implicit keypoint information
+    x: Bx3xHxW, normalized to 0~1
+    flag_refine_info: whether to trandform the pose to degrees and the dimention of the reshape
+    return: A dict contains keys: 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp'
+    """
+    outs = motion_extractor.run(None, input_feed={"input": x.numpy()}) # TODO: axengine 中的 run 输入参数与 ort 还是些许不同
+    # import pdb; pdb.set_trace()
+    # outs = list(outs.values())
+    kp_info = {}
+    kp_info['pitch'] = torch.from_numpy(outs[0])
+    kp_info['yaw'] = torch.from_numpy(outs[1])
+    kp_info['roll'] = torch.from_numpy(outs[2])
+    kp_info['t'] = torch.from_numpy(outs[3])
+    kp_info['exp'] = torch.from_numpy(outs[4])
+    kp_info['scale'] = torch.from_numpy(outs[5])
+    kp_info['kp'] = torch.from_numpy(outs[6])
+    flag_refine_info: bool = True
+    if flag_refine_info:
+        bs = kp_info['kp'].shape[0]
+        kp_info['pitch'] = headpose_pred_to_degree(kp_info['pitch'])[:, None]  # Bx1
+        kp_info['yaw'] = headpose_pred_to_degree(kp_info['yaw'])[:, None]  # Bx1
+        kp_info['roll'] = headpose_pred_to_degree(kp_info['roll'])[:, None]  # Bx1
+        kp_info['kp'] = kp_info['kp'].reshape(bs, -1, 3)  # BxNx3
+        kp_info['exp'] = kp_info['exp'].reshape(bs, -1, 3)  # BxNx3
+    return kp_info
+def transform_keypoint(kp_info: dict):
+    """
+    transform the implicit keypoints with the pose, shift, and expression deformation
+    kp: BxNx3
+    """
+    kp = kp_info['kp']    # (bs, k, 3)
+    pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']
+    t, exp = kp_info['t'], kp_info['exp']
+    scale = kp_info['scale']
+    pitch = headpose_pred_to_degree(pitch)
+    yaw = headpose_pred_to_degree(yaw)
+    roll = headpose_pred_to_degree(roll)
+    bs = kp.shape[0]
+    if kp.ndim == 2:
+        num_kp = kp.shape[1] // 3  # Bx(num_kpx3)
+    else:
+        num_kp = kp.shape[1]  # Bxnum_kpx3
+    rot_mat = get_rotation_matrix(pitch, yaw, roll)    # (bs, 3, 3), 欧拉角转换为旋转矩阵
+    # Eqn.2: s * (R * x_c,s + exp) + t
+    kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
+    kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
+    kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty
+    return kp_transformed
+def make_motion_template(I_lst, c_eyes_lst, c_lip_lst, **kwargs):
+    n_frames = I_lst.shape[0]
+    template_dct = {
+        'n_frames': n_frames,
+        'output_fps': kwargs.get('output_fps', 25),
+        'motion': [],
+        'c_eyes_lst': [],
+        'c_lip_lst': [],
+    }
+    for i in range(n_frames):
+        # collect s, R, δ and t for inference
+        I_i = I_lst[i]
+        x_i_info = get_kp_info(I_i)
+        x_s = transform_keypoint(x_i_info)
+        R_i = get_rotation_matrix(x_i_info['pitch'], x_i_info['yaw'], x_i_info['roll'])
+        item_dct = {
+            'scale': x_i_info['scale'].cpu().numpy().astype(np.float32),
+            'R': R_i.cpu().numpy().astype(np.float32),
+            'exp': x_i_info['exp'].cpu().numpy().astype(np.float32),
+            't': x_i_info['t'].cpu().numpy().astype(np.float32),
+            'kp': x_i_info['kp'].cpu().numpy().astype(np.float32),
+            'x_s': x_s.cpu().numpy().astype(np.float32),
+        }
+        template_dct['motion'].append(item_dct)
+        c_eyes = c_eyes_lst[i].astype(np.float32)
+        template_dct['c_eyes_lst'].append(c_eyes)
+        c_lip = c_lip_lst[i].astype(np.float32)
+        template_dct['c_lip_lst'].append(c_lip)
+    return template_dct
+def prepare_source(img: np.ndarray) -> torch.Tensor:
+    """ construct the input as standard
+    img: HxWx3, uint8, 256x256
+    """
+    device = "cpu"
+    h, w = img.shape[:2]
+    x = img.copy()
+    if x.ndim == 3:
+        x = x[np.newaxis].astype(np.float32) / 255.  # HxWx3 -> 1xHxWx3, normalized to 0~1
+    elif x.ndim == 4:
+        x = x.astype(np.float32) / 255.  # BxHxWx3, normalized to 0~1
+    else:
+        raise ValueError(f'img ndim should be 3 or 4: {x.ndim}')
+    x = np.clip(x, 0, 1)  # clip to 0~1
+    x = torch.from_numpy(x).permute(0, 3, 1, 2)  # 1xHxWx3 -> 1x3xHxW
+    x = x.to(device)
+    return x
+def extract_feature_3d(x: torch.Tensor) -> torch.Tensor:
+    """ get the appearance feature of the image by F
+    x: Bx3xHxW, normalized to 0~1
+    """
+    outs = appearance_feature_extractor.run(None, input_feed={"input": x.numpy()})[0]
+    # outs = list(outs.values())[0]
+    # import pdb; pdb.set_trace()
+    return torch.from_numpy(outs)
+def stitch(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """
+    kp_source: BxNx3
+    kp_driving: BxNx3
+    Return: Bx(3*num_kp+2)
+    """
+    feat_stiching = concat_feat(kp_source, kp_driving)
+    delta = stitching_retargeting_module.run(None, input_feed={"input": feat_stiching.numpy()})[0]
+    # delta = list(delta.values())[0]
+    return torch.from_numpy(delta)
+def stitching(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """ conduct the stitching
+    kp_source: Bxnum_kpx3
+    kp_driving: Bxnum_kpx3
+    """
+    bs, num_kp = kp_source.shape[:2]
+    kp_driving_new = kp_driving.clone()
+    delta = stitch(kp_source, kp_driving_new)
+    delta_exp = delta[..., :3*num_kp].reshape(bs, num_kp, 3)  # 1x20x3
+    delta_tx_ty = delta[..., 3*num_kp:3*num_kp+2].reshape(bs, 1, 2)  # 1x1x2
+    kp_driving_new += delta_exp
+    kp_driving_new[..., :2] += delta_tx_ty
+    return kp_driving_new
+def warp_decode(feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """ get the image after the warping of the implicit keypoints
+    feature_3d: Bx32x16x64x64, feature volume
+    kp_source: BxNx3
+    kp_driving: BxNx3
+    """
+    warp_timer = Timer()
+    warp_timer.tic()
+    outs = warping_module.run([], {"feature_3d": feature_3d.numpy(), "kp_driving": kp_driving.numpy(), "kp_source": kp_source.numpy()})[2]
+    warp_timer.toc()
+    logger.debug(f'warp time: {warp_timer.diff:.3f}s')
+    # outs = warping_module.run(input_feed={"feature_3d": feature_3d.numpy(), "kp_driving": kp_driving.numpy(), "kp_source": kp_source.numpy()})['out']
+    outs = spade_generator.run(None, input_feed={"input":  outs})[0]
+    # outs = list(outs.values())[0]
+    ret_dct = {}
+    ret_dct['out'] = torch.from_numpy(outs)
+    return ret_dct
+def parse_output(out: torch.Tensor) -> np.ndarray:
+    """ construct the output as standard
+    return: 1xHxWx3, uint8
+    """
+    out = np.transpose(out.data.cpu().numpy(), [0, 2, 3, 1])  # 1x3xHxW -> 1xHxWx3
+    out = np.clip(out, 0, 1)  # clip to 0~1
+    out = np.clip(out * 255, 0, 255).astype(np.uint8)  # 0~1 -> 0~255
+    return out
+def load_model(model_type, model_path=None):
+    if model_type == 'appearance_feature_extractor':
+        model = InferenceSession(f"{model_path}/feature_extractor.axmodel")
+    elif model_type == 'motion_extractor':
+        model = InferenceSession(f'{model_path}/motion_extractor.axmodel')
+    elif model_type == 'warping_module':
+        model = ort.InferenceSession(f'{model_path}/warp.onnx', providers=["CPUExecutionProvider"])
+        # model = InferenceSession(f'{model_path}/warp.axmodel')
+    elif model_type == 'spade_generator':
+        model = InferenceSession(f'{model_path}/spade_generator.axmodel')
+    elif model_type == 'stitching_retargeting_module':
+        model = InferenceSession(f'{model_path}/stitching_retargeting.axmodel')
+    return model
+def main():
+    args = parse_args()
+    global appearance_feature_extractor
+    appearance_feature_extractor = load_model("appearance_feature_extractor", args.models)
+    global motion_extractor
+    motion_extractor = load_model("motion_extractor", args.models)
+    global warping_module
+    warping_module = load_model("warping_module", args.models)
+    global spade_generator
+    spade_generator = load_model("spade_generator", args.models)
+    global stitching_retargeting_module
+    stitching_retargeting_module = load_model("stitching_retargeting_module", args.models)
+    source = args.source
+    driving = args.driving
+    ffmpeg_dir = os.path.join(os.getcwd(), "ffmpeg")
+    if osp.exists(ffmpeg_dir):
+        os.environ["PATH"] += (os.pathsep + ffmpeg_dir)
+    if not fast_check_ffmpeg():
+        raise ImportError(
+            "FFmpeg is not installed. Please install FFmpeg (including ffmpeg and ffprobe) before running this script. https://ffmpeg.org/download.html"
+        )
+    source_rgb_lst = preprocess(source)  # rgb, resize & limit
+    if is_video(args.driving):
+        flag_is_driving_video = True
+        # load from video file, AND make motion template
+        output_fps = int(get_fps(args.driving))
+        driving_rgb_lst = load_video(args.driving)
+    elif is_image(args.driving):
+        flag_is_driving_video = False
+        output_fps = 25
+        driving_rgb_lst = [load_image_rgb(driving)] # rgb
+    else:
+        raise Exception(f"{args.driving} is not a supported type!")
+    ######## make motion template ########
+    cropper: Cropper = Cropper()
+    logger.info("Start making driving motion template...")
+    driving_n_frames = len(driving_rgb_lst)
+    n_frames = driving_n_frames
+    driving_lmk_crop_lst = cropper.calc_lmks_from_cropped_video(driving_rgb_lst) # cropper.
+    driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]  # force to resize to 256x256
+    #######################################
+    c_d_eyes_lst, c_d_lip_lst = calc_ratio(driving_lmk_crop_lst)
+    # save the motion template
+    I_d_lst = prepare_videos(driving_rgb_crop_256x256_lst)
+    driving_template_dct = make_motion_template(I_d_lst, c_d_eyes_lst, c_d_lip_lst, output_fps=output_fps)
+    # wfp_template = remove_suffix(args.driving) + '.pkl'
+    # dump(wfp_template, driving_template_dct)
+    # logger.info(f"Dump motion template to {wfp_template}")
+    if not flag_is_driving_video:
+        c_d_eyes_lst = c_d_eyes_lst * n_frames
+        c_d_lip_lst = c_d_lip_lst * n_frames
+    I_p_pstbk_lst = []
+    logger.info("Prepared pasteback mask done.")
+    I_p_lst = []
+    R_d_0, x_d_0_info = None, None
+    flag_normalize_lip = False # inf_cfg.flag_normalize_lip  # not overwrite
+    flag_source_video_eye_retargeting = False # inf_cfg.flag_source_video_eye_retargeting  # not overwrite
+    lip_delta_before_animation, eye_delta_before_animation = None, None
+    ######## process source info ########
+    # if the input is a source image, process it only once
+    flag_do_crop = True
+    if flag_do_crop:
+        crop_info = cropper.crop_source_image(source_rgb_lst[0])
+        if crop_info is None:
+            raise Exception("No face detected in the source image!")
+        source_lmk = crop_info['lmk_crop']
+        img_crop_256x256 = crop_info['img_crop_256x256']
+    else:
+        source_lmk = cropper.calc_lmk_from_cropped_image(source_rgb_lst[0])
+        img_crop_256x256 = cv2.resize(source_rgb_lst[0], (256, 256))  # force to resize to 256x256
+    I_s = prepare_source(img_crop_256x256)
+    x_s_info = get_kp_info(I_s)
+    x_c_s = x_s_info['kp']
+    R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
+    f_s = extract_feature_3d(I_s)
+    x_s = transform_keypoint(x_s_info)
+    # let lip-open scalar to be 0 at first
+    mask_crop: ndarray = cv2.imread(make_abs_path('./utils/resources/mask_template.png'), cv2.IMREAD_COLOR)
+    mask_ori_float = prepare_paste_back(mask_crop, crop_info['M_c2o'], dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0]))
+    with open(make_abs_path('./utils/resources/lip_array.pkl'), 'rb') as f:
+        lip_array = pkl.load(f)
+    device = "cpu"
+    flag_is_source_video = False
+    ######## animate ########
+    if flag_is_driving_video: #  or (flag_is_source_video and not flag_is_driving_video)
+        logger.info(f"The animated video consists of {n_frames} frames.")
+    else:
+        logger.info(f"The output of image-driven portrait animation is an image.")
+    for i in range(n_frames):
+        x_d_i_info = driving_template_dct['motion'][i]
+        x_d_i_info = dct2device(x_d_i_info, device)
+        R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys
+        if i == 0:  # cache the first frame
+            R_d_0 = R_d_i
+            x_d_0_info = x_d_i_info.copy()
+        delta_new = x_s_info['exp'].clone()
+        R_new = x_d_r_lst_smooth[i] if flag_is_source_video else (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
+        if flag_is_driving_video:
+            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
+        else:
+            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(lip_array).to(dtype=torch.float32, device=device))
+        # delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(lip_array).to(dtype=torch.float32, device=device))
+        scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
+        t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
+        t_new[..., 2].fill_(0)  # zero tz
+        x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
+        if i == 0 and flag_is_driving_video:
+            x_d_0_new = x_d_i_new
+            motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
+            # motion_multiplier *= inf_cfg.driving_multiplier
+            x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
+            x_d_i_new = x_d_diff + x_s
+        # Algorithm 1:
+        # with stitching and without retargeting
+        x_d_i_new = stitching(x_s, x_d_i_new)
+        x_d_i_new = x_s + (x_d_i_new - x_s) * 1.0
+        out = warp_decode(f_s, x_s, x_d_i_new)
+        I_p_i = parse_output(out['out'])[0]
+        I_p_lst.append(I_p_i)
+        I_p_pstbk = paste_back(I_p_i, crop_info['M_c2o'], source_rgb_lst[0], mask_ori_float)
+        I_p_pstbk_lst.append(I_p_pstbk)
+    mkdir(args.output_dir)
+    wfp_concat = None
+    ######### build the final concatenation result #########
+    # driving frame | source frame | generation
+    frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst)
+    if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video):
+        flag_source_has_audio = flag_is_source_video and has_audio_stream(args.source)
+        flag_driving_has_audio = has_audio_stream(args.driving)
+        wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.mp4')
+        # NOTE: update output fps
+        output_fps = source_fps if flag_is_source_video else output_fps
+        images2video(frames_concatenated, wfp=wfp_concat, fps=output_fps)
+        if flag_source_has_audio or flag_driving_has_audio:
+            # final result with concatenation
+            wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
+            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
+            logger.info(f"Audio is selected from {audio_from_which_video}, concat mode")
+            add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
+            os.replace(wfp_concat_with_audio, wfp_concat)
+            logger.info(f"Replace {wfp_concat_with_audio} with {wfp_concat}")
+        # save the animated result
+        wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.mp4')
+        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
+            images2video(I_p_pstbk_lst, wfp=wfp, fps=output_fps)
+        else:
+            images2video(I_p_lst, wfp=wfp, fps=output_fps)
+        ######### build the final result #########
+        if flag_source_has_audio or flag_driving_has_audio:
+            wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
+            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
+            logger.info(f"Audio is selected from {audio_from_which_video}")
+            add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
+            os.replace(wfp_with_audio, wfp)
+            logger.info(f"Replace {wfp_with_audio} with {wfp}")
+        # final log
+        # if wfp_template not in (None, ''):
+        #     logger.info(f'Animated template: {wfp_template}, you can specify `-d` argument with this template path next time to avoid cropping video, motion making and protecting privacy.', style='bold green')
+        logger.info(f'Animated video: {wfp}')
+        logger.info(f'Animated video with concat: {wfp_concat}')
+    else:
+        wfp_concat = osp.join(args.output_dir, f'{basename(source)}--{basename(driving)}_concat.jpg')
+        cv2.imwrite(wfp_concat, frames_concatenated[0][..., ::-1])
+        wfp = osp.join(args.output_dir, f'{basename(source)}--{basename(driving)}.jpg')
+        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
+            cv2.imwrite(wfp, I_p_pstbk_lst[0][..., ::-1])
+        else:
+            cv2.imwrite(wfp, frames_concatenated[0][..., ::-1])
+        # final log
+        logger.info(f'Animated image: {wfp}')
+        logger.info(f'Animated image with concat: {wfp_concat}')
+if __name__ == "__main__":
+    """
+    Usage:
+        python3 infer.py --source ../assets/examples/source/s0.jpg --driving ../assets/examples/driving/d8.jpg --models ./axmdoels --output-dir ./axmodel_infer
+    """
+    timer = Timer()
+    timer.tic()
+    main()
+    elapse = timer.toc()
+    logger.debug(f'LivePortrait axmodel infer time: {elapse:.3f}s')

python/infer_onnx.py ADDED Viewed

	@@ -0,0 +1,952 @@

+import argparse
+import cv2
+import numpy as np
+import os
+import onnxruntime as ort
+import numpy as np
+import cv2
+import argparse
+import os.path as osp
+from loguru import logger
+from numpy import ndarray
+import pickle as pkl
+import torch
+import torch.nn.functional as F
+from cropper import Cropper
+import imageio
+import subprocess
+from utils.timer import Timer
+from typing import Union
+from scipy.spatial import ConvexHull # pylint: disable=E0401,E0611
+# Module-level slots for the five inference sessions; the helper functions below read them
+# as globals. They start as None and are presumably assigned by main() before use (TODO confirm).
+appearance_feature_extractor, motion_extractor, warping_module, spade_generator, stitching_retargeting_module = None, None, None, None, None
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="LivePortrait",
+        description="LivePortrait: A Real-time 3D Live Portrait Animation System"
+    )
+    parser.add_argument(
+        "--source",
+        type=str,
+        required=True,
+        help="Path to source image.",
+    )
+    parser.add_argument(
+        "--driving",
+        type=str,
+        required=True,
+        help="Path to driving image.",
+    )
+    parser.add_argument(
+        "--models",
+        type=str,
+        required=True,
+        help="Path to onnx models.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="./output",
+        help="Path to infer results.",
+    )
+    return parser.parse_args()
+def images2video(images, wfp, **kwargs):
+    fps = kwargs.get('fps', 30)
+    video_format = kwargs.get('format', 'mp4')  # default is mp4 format
+    codec = kwargs.get('codec', 'libx264')  # default is libx264 encoding
+    quality = kwargs.get('quality')  # video quality
+    pixelformat = kwargs.get('pixelformat', 'yuv420p')  # video pixel format
+    image_mode = kwargs.get('image_mode', 'rgb')
+    macro_block_size = kwargs.get('macro_block_size', 2)
+    ffmpeg_params = ['-crf', str(kwargs.get('crf', 18))]
+    writer = imageio.get_writer(
+        wfp, fps=fps, format=video_format,
+        codec=codec, quality=quality, ffmpeg_params=ffmpeg_params, pixelformat=pixelformat, macro_block_size=macro_block_size
+    )
+    n = len(images)
+    for i in range(n):
+        if image_mode.lower() == 'bgr':
+            writer.append_data(images[i][..., ::-1])
+        else:
+            writer.append_data(images[i])
+    writer.close()
+def is_template(file_path):
+    if file_path.endswith(".pkl"):
+        return True
+    return False
+def has_audio_stream(video_path: str) -> bool:
+    """
+    Check if the video file contains an audio stream.
+    :param video_path: Path to the video file
+    :return: True if the video contains an audio stream, False otherwise
+    """
+    if osp.isdir(video_path):
+        return False
+    cmd = [
+        'ffprobe',
+        '-v', 'error',
+        '-select_streams', 'a',
+        '-show_entries', 'stream=codec_type',
+        '-of', 'default=noprint_wrappers=1:nokey=1',
+        f'"{video_path}"'
+    ]
+    try:
+        # result = subprocess.run(cmd, capture_output=True, text=True)
+        result = exec_cmd(' '.join(cmd))
+        if result.returncode != 0:
+            logger.info(f"Error occurred while probing video: {result.stderr}")
+            return False
+        # Check if there is any output from ffprobe command
+        return bool(result.stdout.strip())
+    except Exception as e:
+        logger.info(
+            f"Error occurred while probing video: {video_path}, "
+            "you may need to install ffprobe! (https://ffmpeg.org/download.html) "
+            "Now set audio to false!",
+            style="bold red"
+        )
+    return False
+def tensor_to_numpy(data: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
+    """transform torch.Tensor into numpy.ndarray"""
+    if isinstance(data, torch.Tensor):
+        return data.data.cpu().numpy()
+    return data
+def calc_motion_multiplier(
+    kp_source: Union[np.ndarray, torch.Tensor],
+    kp_driving_initial: Union[np.ndarray, torch.Tensor]
+) -> float:
+    """calculate motion_multiplier based on the source image and the first driving frame"""
+    kp_source_np = tensor_to_numpy(kp_source)
+    kp_driving_initial_np = tensor_to_numpy(kp_driving_initial)
+    source_area = ConvexHull(kp_source_np.squeeze(0)).volume
+    driving_area = ConvexHull(kp_driving_initial_np.squeeze(0)).volume
+    motion_multiplier = np.sqrt(source_area) / np.sqrt(driving_area)
+    # motion_multiplier = np.cbrt(source_area) / np.cbrt(driving_area)
+    return motion_multiplier
+def load_video(video_info, n_frames=-1):
+    reader = imageio.get_reader(video_info, "ffmpeg")
+    ret = []
+    for idx, frame_rgb in enumerate(reader):
+        if n_frames > 0 and idx >= n_frames:
+            break
+        ret.append(frame_rgb)
+    reader.close()
+    return ret
+def fast_check_ffmpeg():
+    try:
+        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
+        return True
+    except:
+        return False
+def is_video(file_path):
+    if file_path.lower().endswith((".mp4", ".mov", ".avi", ".webm")) or osp.isdir(file_path):
+        return True
+    return False
+def is_image(file_path):
+    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp')
+    return file_path.lower().endswith(image_extensions)
+def get_fps(filepath, default_fps=25):
+    try:
+        fps = cv2.VideoCapture(filepath).get(cv2.CAP_PROP_FPS)
+        if fps in (0, None):
+            fps = default_fps
+    except Exception as e:
+        logger.info(e)
+        fps = default_fps
+    return fps
+def calculate_distance_ratio(lmk: np.ndarray, idx1: int, idx2: int, idx3: int, idx4: int, eps: float = 1e-6) -> np.ndarray:
+    return (np.linalg.norm(lmk[:, idx1] - lmk[:, idx2], axis=1, keepdims=True) /
+            (np.linalg.norm(lmk[:, idx3] - lmk[:, idx4], axis=1, keepdims=True) + eps))
+def calc_eye_close_ratio(lmk: np.ndarray, target_eye_ratio: np.ndarray = None) -> np.ndarray:
+    lefteye_close_ratio = calculate_distance_ratio(lmk, 6, 18, 0, 12)
+    righteye_close_ratio = calculate_distance_ratio(lmk, 30, 42, 24, 36)
+    if target_eye_ratio is not None:
+        return np.concatenate([lefteye_close_ratio, righteye_close_ratio, target_eye_ratio], axis=1)
+    else:
+        return np.concatenate([lefteye_close_ratio, righteye_close_ratio], axis=1)
+def calc_lip_close_ratio(lmk: np.ndarray) -> np.ndarray:
+    return calculate_distance_ratio(lmk, 90, 102, 48, 66)
+def concat_frames(driving_image_lst, source_image_lst, I_p_lst):
+    # TODO: add more concat style, e.g., left-down corner driving
+    out_lst = []
+    h, w, _ = I_p_lst[0].shape
+    source_image_resized_lst = [cv2.resize(img, (w, h)) for img in source_image_lst]
+    for idx, _ in enumerate(I_p_lst):
+        I_p = I_p_lst[idx]
+        source_image_resized = source_image_resized_lst[idx] if len(source_image_lst) > 1 else source_image_resized_lst[0]
+        if driving_image_lst is None:
+            out = np.hstack((source_image_resized, I_p))
+        else:
+            driving_image = driving_image_lst[idx]
+            driving_image_resized = cv2.resize(driving_image, (w, h))
+            out = np.hstack((driving_image_resized, source_image_resized, I_p))
+        out_lst.append(out)
+    return out_lst
+def concat_feat(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """
+    kp_source: (bs, k, 3)
+    kp_driving: (bs, k, 3)
+    Return: (bs, 2k*3)
+    """
+    bs_src = kp_source.shape[0]
+    bs_dri = kp_driving.shape[0]
+    assert bs_src == bs_dri, 'batch size must be equal'
+    feat = torch.cat([kp_source.view(bs_src, -1), kp_driving.view(bs_dri, -1)], dim=1)
+    return feat
+# float type constant; not referenced by the code visible in this part of the file
+DTYPE = np.float32
+# default interpolation flag used by _transform_img
+CV2_INTERP = cv2.INTER_LINEAR
+def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
+    """ conduct similarity or affine transformation to the image, do not do border operation!
+    img:
+    M: 2x3 matrix or 3x3 matrix
+    dsize: target shape (width, height)
+    """
+    if isinstance(dsize, tuple) or isinstance(dsize, list):
+        _dsize = tuple(dsize)
+    else:
+        _dsize = (dsize, dsize)
+    if borderMode is not None:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags, borderMode=borderMode, borderValue=(0, 0, 0))
+    else:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags)
+def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
+    """prepare mask for later image paste back
+    """
+    mask_ori = _transform_img(mask_crop, crop_M_c2o, dsize)
+    mask_ori = mask_ori.astype(np.float32) / 255.
+    return mask_ori
+def paste_back(img_crop, M_c2o, img_ori, mask_ori):
+    """paste back the image
+    """
+    dsize = (img_ori.shape[1], img_ori.shape[0])
+    result = _transform_img(img_crop, M_c2o, dsize=dsize)
+    result = np.clip(mask_ori * result + (1 - mask_ori) * img_ori, 0, 255).astype(np.uint8)
+    return result
+def prefix(filename):
+    """a.jpg -> a"""
+    pos = filename.rfind(".")
+    if pos == -1:
+        return filename
+    return filename[:pos]
+def basename(filename):
+    """a/b/c.jpg -> c"""
+    return prefix(osp.basename(filename))
+def mkdir(d, log=False):
+    # return self-assined `d`, for one line code
+    if not osp.exists(d):
+        os.makedirs(d, exist_ok=True)
+        if log:
+            logger.info(f"Make dir: {d}")
+    return d
+def dct2device(dct: dict, device):
+    for key in dct:
+        if isinstance(dct[key], torch.Tensor):
+            dct[key] = dct[key].to(device)
+        else:
+            dct[key] = torch.tensor(dct[key]).to(device)
+    return dct
+# pi, used by get_rotation_matrix for the degree -> radian conversion
+PI = np.pi
+def headpose_pred_to_degree(pred):
+    """
+    pred: (bs, 66) or (bs, 1) or others
+    """
+    if pred.ndim > 1 and pred.shape[1] == 66:
+        # NOTE: note that the average is modified to 97.5
+        device = pred.device
+        idx_tensor = [idx for idx in range(0, 66)]
+        idx_tensor = torch.FloatTensor(idx_tensor).to(device)
+        pred = F.softmax(pred, dim=1)
+        degree = torch.sum(pred*idx_tensor, axis=1) * 3 - 97.5
+        return degree
+    return pred
+def get_rotation_matrix(pitch_, yaw_, roll_):
+    """ the input is in degree
+    """
+    # transform to radian
+    pitch = pitch_ / 180 * PI
+    yaw = yaw_ / 180 * PI
+    roll = roll_ / 180 * PI
+    device = pitch.device
+    if pitch.ndim == 1:
+        pitch = pitch.unsqueeze(1)
+    if yaw.ndim == 1:
+        yaw = yaw.unsqueeze(1)
+    if roll.ndim == 1:
+        roll = roll.unsqueeze(1)
+    # calculate the euler matrix
+    bs = pitch.shape[0]
+    ones = torch.ones([bs, 1]).to(device)
+    zeros = torch.zeros([bs, 1]).to(device)
+    x, y, z = pitch, yaw, roll
+    rot_x = torch.cat([
+        ones, zeros, zeros,
+        zeros, torch.cos(x), -torch.sin(x),
+        zeros, torch.sin(x), torch.cos(x)
+    ], dim=1).reshape([bs, 3, 3])
+    rot_y = torch.cat([
+        torch.cos(y), zeros, torch.sin(y),
+        zeros, ones, zeros,
+        -torch.sin(y), zeros, torch.cos(y)
+    ], dim=1).reshape([bs, 3, 3])
+    rot_z = torch.cat([
+        torch.cos(z), -torch.sin(z), zeros,
+        torch.sin(z), torch.cos(z), zeros,
+        zeros, zeros, ones
+    ], dim=1).reshape([bs, 3, 3])
+    rot = rot_z @ rot_y @ rot_x
+    return rot.permute(0, 2, 1)  # transpose
+def suffix(filename):
+    """a.jpg -> jpg"""
+    pos = filename.rfind(".")
+    if pos == -1:
+        return ""
+    return filename[pos + 1:]
+def remove_suffix(filepath):
+    """a/b/c.jpg -> a/b/c"""
+    return osp.join(osp.dirname(filepath), basename(filepath))
+def load(fp):
+    suffix_ = suffix(fp)
+    if suffix_ == "npy":
+        return np.load(fp)
+    elif suffix_ == "pkl":
+        return pkl.load(open(fp, "rb"))
+    else:
+        raise Exception(f"Unknown type: {suffix}")
+def dump(wfp, obj):
+    wd = osp.split(wfp)[0]
+    if wd != "" and not osp.exists(wd):
+        mkdir(wd)
+    _suffix = suffix(wfp)
+    if _suffix == "npy":
+        np.save(wfp, obj)
+    elif _suffix == "pkl":
+        pkl.dump(obj, open(wfp, "wb"))
+    else:
+        raise Exception("Unknown type: {}".format(_suffix))
+def make_abs_path(fn):
+    return osp.join(osp.dirname(osp.realpath(__file__)), fn)
+def load_image_rgb(image_path: str):
+    if not osp.exists(image_path):
+        raise FileNotFoundError(f"Image not found: {image_path}")
+    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
+    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+def resize_to_limit(img: np.ndarray, max_dim=1920, division=2):
+    """
+    ajust the size of the image so that the maximum dimension does not exceed max_dim, and the width and the height of the image are multiples of n.
+    :param img: the image to be processed.
+    :param max_dim: the maximum dimension constraint.
+    :param n: the number that needs to be multiples of.
+    :return: the adjusted image.
+    """
+    h, w = img.shape[:2]
+    # ajust the size of the image according to the maximum dimension
+    if max_dim > 0 and max(h, w) > max_dim:
+        if h > w:
+            new_h = max_dim
+            new_w = int(w * (max_dim / h))
+        else:
+            new_w = max_dim
+            new_h = int(h * (max_dim / w))
+        img = cv2.resize(img, (new_w, new_h))
+    # ensure that the image dimensions are multiples of n
+    division = max(division, 1)
+    new_h = img.shape[0] - (img.shape[0] % division)
+    new_w = img.shape[1] - (img.shape[1] % division)
+    if new_h == 0 or new_w == 0:
+        # when the width or height is less than n, no need to process
+        return img
+    if new_h != img.shape[0] or new_w != img.shape[1]:
+        img = img[:new_h, :new_w]
+    return img
+def preprocess(input_data):
+    img_rgb = load_image_rgb(input_data)
+    img_rgb = resize_to_limit(img_rgb)
+    return [img_rgb]
+def postprocess(output_data):
+    # Implement your postprocessing steps here
+    # For example, you might convert the output to a specific format
+    return output_data
+def infer(model, input_data):
+    input_name = model.get_inputs()[0].name
+    output_name = model.get_outputs()[0].name
+    input_data = preprocess(input_data) # rgb, resize & limit
+    result = model.run([output_name], {input_name: input_data})
+    return postprocess(result)
+def partial_fields(target_class, kwargs):
+    return target_class(**{k: v for k, v in kwargs.items() if hasattr(target_class, k)})
+def calc_ratio(lmk_lst):
+    input_eye_ratio_lst = []
+    input_lip_ratio_lst = []
+    for lmk in lmk_lst:
+        # for eyes retargeting
+        input_eye_ratio_lst.append(calc_eye_close_ratio(lmk[None]))
+        # for lip retargeting
+        input_lip_ratio_lst.append(calc_lip_close_ratio(lmk[None]))
+    return input_eye_ratio_lst, input_lip_ratio_lst
+def prepare_videos(imgs) -> torch.Tensor:
+    """ construct the input as standard
+    imgs: NxBxHxWx3, uint8
+    """
+    device = "cpu"
+    if isinstance(imgs, list):
+        _imgs = np.array(imgs)[..., np.newaxis]  # TxHxWx3x1
+    elif isinstance(imgs, np.ndarray):
+        _imgs = imgs
+    else:
+        raise ValueError(f'imgs type error: {type(imgs)}')
+    y = _imgs.astype(np.float32) / 255.
+    y = np.clip(y, 0, 1)  # clip to 0~1
+    y = torch.from_numpy(y).permute(0, 4, 3, 1, 2)  # TxHxWx3x1 -> Tx1x3xHxW
+    y = y.to(device)
+    return y
+def get_kp_info(x: torch.Tensor) -> dict:
+    """ get the implicit keypoint information
+    x: Bx3xHxW, normalized to 0~1
+    flag_refine_info: whether to trandform the pose to degrees and the dimention of the reshape
+    return: A dict contains keys: 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp'
+    """
+    outs = motion_extractor.run([], input_feed={"input": x.numpy()}) # TODO: axengine 中的 run 输入参数与 ort 还是些许不同
+    kp_info = {}
+    kp_info['pitch'] = torch.from_numpy(outs[0])
+    kp_info['yaw'] = torch.from_numpy(outs[1])
+    kp_info['roll'] = torch.from_numpy(outs[2])
+    kp_info['t'] = torch.from_numpy(outs[3])
+    kp_info['exp'] = torch.from_numpy(outs[4])
+    kp_info['scale'] = torch.from_numpy(outs[5])
+    kp_info['kp'] = torch.from_numpy(outs[6])
+    flag_refine_info: bool = True
+    if flag_refine_info:
+        bs = kp_info['kp'].shape[0]
+        kp_info['pitch'] = headpose_pred_to_degree(kp_info['pitch'])[:, None]  # Bx1
+        kp_info['yaw'] = headpose_pred_to_degree(kp_info['yaw'])[:, None]  # Bx1
+        kp_info['roll'] = headpose_pred_to_degree(kp_info['roll'])[:, None]  # Bx1
+        kp_info['kp'] = kp_info['kp'].reshape(bs, -1, 3)  # BxNx3
+        kp_info['exp'] = kp_info['exp'].reshape(bs, -1, 3)  # BxNx3
+    return kp_info
+def transform_keypoint(kp_info: dict):
+    """
+    transform the implicit keypoints with the pose, shift, and expression deformation
+    kp: BxNx3
+    """
+    kp = kp_info['kp']    # (bs, k, 3)
+    pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']
+    t, exp = kp_info['t'], kp_info['exp']
+    scale = kp_info['scale']
+    pitch = headpose_pred_to_degree(pitch)
+    yaw = headpose_pred_to_degree(yaw)
+    roll = headpose_pred_to_degree(roll)
+    bs = kp.shape[0]
+    if kp.ndim == 2:
+        num_kp = kp.shape[1] // 3  # Bx(num_kpx3)
+    else:
+        num_kp = kp.shape[1]  # Bxnum_kpx3
+    rot_mat = get_rotation_matrix(pitch, yaw, roll)    # (bs, 3, 3), 欧拉角转换为旋转矩阵
+    # Eqn.2: s * (R * x_c,s + exp) + t
+    kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
+    kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
+    kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty
+    return kp_transformed
+def make_motion_template(I_lst, c_eyes_lst, c_lip_lst, **kwargs):
+    n_frames = I_lst.shape[0]
+    template_dct = {
+        'n_frames': n_frames,
+        'output_fps': kwargs.get('output_fps', 25),
+        'motion': [],
+        'c_eyes_lst': [],
+        'c_lip_lst': [],
+    }
+    for i in range(n_frames):
+        # collect s, R, δ and t for inference
+        I_i = I_lst[i]
+        x_i_info = get_kp_info(I_i)
+        x_s = transform_keypoint(x_i_info)
+        R_i = get_rotation_matrix(x_i_info['pitch'], x_i_info['yaw'], x_i_info['roll'])
+        item_dct = {
+            'scale': x_i_info['scale'].cpu().numpy().astype(np.float32),
+            'R': R_i.cpu().numpy().astype(np.float32),
+            'exp': x_i_info['exp'].cpu().numpy().astype(np.float32),
+            't': x_i_info['t'].cpu().numpy().astype(np.float32),
+            'kp': x_i_info['kp'].cpu().numpy().astype(np.float32),
+            'x_s': x_s.cpu().numpy().astype(np.float32),
+        }
+        template_dct['motion'].append(item_dct)
+        c_eyes = c_eyes_lst[i].astype(np.float32)
+        template_dct['c_eyes_lst'].append(c_eyes)
+        c_lip = c_lip_lst[i].astype(np.float32)
+        template_dct['c_lip_lst'].append(c_lip)
+    return template_dct
+def prepare_source(img: np.ndarray) -> torch.Tensor:
+    """ construct the input as standard
+    img: HxWx3, uint8, 256x256
+    """
+    device = "cpu"
+    h, w = img.shape[:2]
+    x = img.copy()
+    if x.ndim == 3:
+        x = x[np.newaxis].astype(np.float32) / 255.  # HxWx3 -> 1xHxWx3, normalized to 0~1
+    elif x.ndim == 4:
+        x = x.astype(np.float32) / 255.  # BxHxWx3, normalized to 0~1
+    else:
+        raise ValueError(f'img ndim should be 3 or 4: {x.ndim}')
+    x = np.clip(x, 0, 1)  # clip to 0~1
+    x = torch.from_numpy(x).permute(0, 3, 1, 2)  # 1xHxWx3 -> 1x3xHxW
+    x = x.to(device)
+    return x
+def extract_feature_3d(x: torch.Tensor) -> torch.Tensor:
+    """ get the appearance feature of the image by F
+    x: Bx3xHxW, normalized to 0~1
+    """
+    outs = appearance_feature_extractor.run([], input_feed={"input": x.numpy()})[0]
+    return torch.from_numpy(outs)
+def stitch(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """
+    kp_source: BxNx3
+    kp_driving: BxNx3
+    Return: Bx(3*num_kp+2)
+    """
+    feat_stiching = concat_feat(kp_source, kp_driving)
+    delta = stitching_retargeting_module.run([], input_feed={"input": feat_stiching.numpy()})[0]
+    return torch.from_numpy(delta)
+def stitching(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """ conduct the stitching
+    kp_source: Bxnum_kpx3
+    kp_driving: Bxnum_kpx3
+    """
+    bs, num_kp = kp_source.shape[:2]
+    kp_driving_new = kp_driving.clone()
+    delta = stitch(kp_source, kp_driving_new)
+    delta_exp = delta[..., :3*num_kp].reshape(bs, num_kp, 3)  # 1x20x3
+    delta_tx_ty = delta[..., 3*num_kp:3*num_kp+2].reshape(bs, 1, 2)  # 1x1x2
+    kp_driving_new += delta_exp
+    kp_driving_new[..., :2] += delta_tx_ty
+    return kp_driving_new
+def warp_decode(feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
+    """ get the image after the warping of the implicit keypoints
+    feature_3d: Bx32x16x64x64, feature volume
+    kp_source: BxNx3
+    kp_driving: BxNx3
+    """
+    outs = warping_module.run([], {"feature_3d": feature_3d.numpy(), "kp_driving": kp_driving.numpy(), "kp_source": kp_source.numpy()})[2]
+    outs = spade_generator.run([], input_feed={"input":  outs})[0]
+    ret_dct = {}
+    ret_dct['out'] = torch.from_numpy(outs)
+    return ret_dct
+def parse_output(out: torch.Tensor) -> np.ndarray:
+    """ construct the output as standard
+    return: 1xHxWx3, uint8
+    """
+    out = np.transpose(out.data.cpu().numpy(), [0, 2, 3, 1])  # 1x3xHxW -> 1xHxWx3
+    out = np.clip(out, 0, 1)  # clip to 0~1
+    out = np.clip(out * 255, 0, 255).astype(np.uint8)  # 0~1 -> 0~255
+    return out
+def load_model(model_type, model_path=None):
+    if model_type == 'appearance_feature_extractor':
+        model = ort.InferenceSession(f"{model_path}/feature_extractor.onnx", providers=["CPUExecutionProvider"])
+    elif model_type == 'motion_extractor':
+        model = ort.InferenceSession(f'{model_path}/motion_extractor.onnx', providers=["CPUExecutionProvider"])
+    elif model_type == 'warping_module':
+        model = ort.InferenceSession(f'{model_path}/warp.onnx', providers=["CPUExecutionProvider"])
+    elif model_type == 'spade_generator':
+        model = ort.InferenceSession(f'{model_path}/spade_generator.onnx', providers=["CPUExecutionProvider"])
+    elif model_type == 'stitching_retargeting_module':
+        model = ort.InferenceSession(f'{model_path}/stitching_retargeting.onnx', providers=["CPUExecutionProvider"])
+    return model
+def main():
+    """ run the whole portrait animation pipeline for one source image and one driving input
+    Steps: parse the arguments and load the five ONNX sessions, check ffmpeg, build or load the
+    driving motion template, crop the source image, animate frame by frame, then write the
+    concatenated and the pasted-back results to args.output_dir.
+    """
+    args = parse_args()
+    # the sessions are stored as module globals because the helper functions of this file read them by name
+    global appearance_feature_extractor
+    appearance_feature_extractor = load_model("appearance_feature_extractor", args.models)
+    global motion_extractor
+    motion_extractor = load_model("motion_extractor", args.models)
+    global warping_module
+    warping_module = load_model("warping_module", args.models)
+    global spade_generator
+    spade_generator = load_model("spade_generator", args.models)
+    global stitching_retargeting_module
+    stitching_retargeting_module = load_model("stitching_retargeting_module", args.models)
+    source = args.source
+    driving = args.driving
+    # a ./ffmpeg directory in the current working directory, if present, is appended to PATH
+    ffmpeg_dir = os.path.join(os.getcwd(), "ffmpeg")
+    if osp.exists(ffmpeg_dir):
+        os.environ["PATH"] += (os.pathsep + ffmpeg_dir)
+    if not fast_check_ffmpeg():
+        raise ImportError(
+            "FFmpeg is not installed. Please install FFmpeg (including ffmpeg and ffprobe) before running this script. https://ffmpeg.org/download.html"
+        )
+    source_rgb_lst = preprocess(source)  # rgb, resize & limit
+    ######## process driving info ########
+    flag_load_from_template = is_template(args.driving)
+    driving_rgb_crop_256x256_lst = None
+    wfp_template = None
+    device = "cpu"
+    # NOTE(review): hard-coded to False, so every `flag_is_source_video` branch below is dead code;
+    # those branches use x_d_r_lst_smooth and source_fps, which are not assigned anywhere in main()
+    flag_is_source_video = False
+    cropper: Cropper = Cropper()
+    if flag_load_from_template:
+        # NOTE: load from template, it is fast, but the cropping video is None
+        logger.info(f"Load from template: {args.driving}, NOT the video, so the cropping video and audio are both NULL.", style='bold green')
+        driving_template_dct = load(args.driving)
+        c_d_eyes_lst = driving_template_dct['c_eyes_lst'] if 'c_eyes_lst' in driving_template_dct.keys() else driving_template_dct['c_d_eyes_lst'] # compatible with previous keys
+        c_d_lip_lst = driving_template_dct['c_lip_lst'] if 'c_lip_lst' in driving_template_dct.keys() else driving_template_dct['c_d_lip_lst']
+        driving_n_frames = driving_template_dct['n_frames']
+        flag_is_driving_video = True if driving_n_frames > 1 else False
+        if flag_is_source_video and flag_is_driving_video:
+            n_frames = min(len(source_rgb_lst), driving_n_frames)  # minimum number as the number of the animated frames
+        elif flag_is_source_video and not flag_is_driving_video:
+            n_frames = len(source_rgb_lst)
+        else:
+            n_frames = driving_n_frames
+        # set output_fps
+        output_fps = driving_template_dct.get('output_fps', 25)
+        logger.info(f'The FPS of template: {output_fps}')
+        # the flag is set to False right before it is tested, so the warning below is never logged
+        flag_crop_driving_video = False
+        if flag_crop_driving_video:
+            logger.info("Warning: flag_crop_driving_video is True, but the driving info is a template, so it is ignored.")
+    elif osp.exists(args.driving):
+        if is_video(args.driving):
+            flag_is_driving_video = True
+            # load from video file, AND make motion template
+            output_fps = int(get_fps(args.driving))
+            driving_rgb_lst = load_video(args.driving)
+        elif is_image(args.driving):
+            flag_is_driving_video = False
+            output_fps = 25
+            driving_rgb_lst = [load_image_rgb(driving)] # rgb
+        else:
+            raise Exception(f"{args.driving} is not a supported type!")
+        ######## make motion template ########
+        logger.info("Start making driving motion template...")
+        driving_n_frames = len(driving_rgb_lst)
+        n_frames = driving_n_frames
+        # the driving frames are used as they are (no cropping here): landmarks are computed on them
+        # and they are only resized to 256x256
+        driving_lmk_crop_lst = cropper.calc_lmks_from_cropped_video(driving_rgb_lst) # cropper.
+        driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]  # force to resize to 256x256
+        #######################################
+        c_d_eyes_lst, c_d_lip_lst = calc_ratio(driving_lmk_crop_lst)
+        # save the motion template
+        I_d_lst = prepare_videos(driving_rgb_crop_256x256_lst)
+        driving_template_dct = make_motion_template(I_d_lst, c_d_eyes_lst, c_d_lip_lst, output_fps=output_fps)
+        wfp_template = remove_suffix(args.driving) + '.pkl'
+        dump(wfp_template, driving_template_dct)
+        logger.info(f"Dump motion template to {wfp_template}")
+    else:
+        raise Exception(f"{args.driving} does not exist!")
+    if not flag_is_driving_video:
+        # image driving: repeat the ratio lists n_frames times
+        c_d_eyes_lst = c_d_eyes_lst * n_frames
+        c_d_lip_lst = c_d_lip_lst * n_frames
+    I_p_pstbk_lst = []
+    # NOTE(review): this message is logged before the paste-back mask is actually prepared further below
+    logger.info("Prepared pasteback mask done.")
+    I_p_lst = []
+    R_d_0, x_d_0_info = None, None
+    flag_normalize_lip = False # inf_cfg.flag_normalize_lip  # not overwrite
+    flag_source_video_eye_retargeting = False # inf_cfg.flag_source_video_eye_retargeting  # not overwrite
+    lip_delta_before_animation, eye_delta_before_animation = None, None
+    ######## process source info ########
+    # if the input is a source image, process it only once
+    # NOTE(review): hard-coded to True; the else branch does not define crop_info, which is used unconditionally below
+    flag_do_crop = True
+    if flag_do_crop:
+        crop_info = cropper.crop_source_image(source_rgb_lst[0])
+        if crop_info is None:
+            raise Exception("No face detected in the source image!")
+        source_lmk = crop_info['lmk_crop']
+        img_crop_256x256 = crop_info['img_crop_256x256']
+    else:
+        source_lmk = cropper.calc_lmk_from_cropped_image(source_rgb_lst[0])
+        img_crop_256x256 = cv2.resize(source_rgb_lst[0], (256, 256))  # force to resize to 256x256
+    I_s = prepare_source(img_crop_256x256)
+    x_s_info = get_kp_info(I_s)
+    x_c_s = x_s_info['kp']
+    R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
+    f_s = extract_feature_3d(I_s)
+    x_s = transform_keypoint(x_s_info)
+    # let lip-open scalar to be 0 at first
+    # the mask template is warped into the coordinates of the original image once and reused for every frame
+    mask_crop: ndarray = cv2.imread(make_abs_path('./utils/resources/mask_template.png'), cv2.IMREAD_COLOR)
+    mask_ori_float = prepare_paste_back(mask_crop, crop_info['M_c2o'], dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0]))
+    # lip_array is loaded with pickle from a resource file next to this script; it is only used for image driving
+    with open(make_abs_path('./utils/resources/lip_array.pkl'), 'rb') as f:
+        lip_array = pkl.load(f)
+    ######## animate ########
+    if flag_is_driving_video: #  or (flag_is_source_video and not flag_is_driving_video)
+        logger.info(f"The animated video consists of {n_frames} frames.")
+    else:
+        logger.info(f"The output of image-driven portrait animation is an image.")
+    for i in range(n_frames):
+        x_d_i_info = driving_template_dct['motion'][i]
+        x_d_i_info = dct2device(x_d_i_info, device)
+        R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys
+        if i == 0:  # cache the first frame
+            R_d_0 = R_d_i
+            x_d_0_info = x_d_i_info.copy()
+        # this initial value is always overwritten by the if/else just below
+        delta_new = x_s_info['exp'].clone()
+        # relative motion: the driving rotation relative to the first driving frame is applied on top of the source rotation
+        R_new = x_d_r_lst_smooth[i] if flag_is_source_video else (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
+        if flag_is_driving_video:
+            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
+        else:
+            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(lip_array).to(dtype=torch.float32, device=device))
+        # delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(lip_array).to(dtype=torch.float32, device=device))
+        scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
+        t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
+        t_new[..., 2].fill_(0)  # zero tz
+        x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
+        # NOTE(review): this block only runs for i == 0, where x_d_diff is all zeros, so the first frame
+        # ends up equal to x_s; x_d_0_new and motion_multiplier are not used for later frames.
+        # Confirm whether the multiplier was meant to be applied to every frame.
+        if i == 0 and flag_is_driving_video:
+            x_d_0_new = x_d_i_new
+            motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
+            # motion_multiplier *= inf_cfg.driving_multiplier
+            x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
+            x_d_i_new = x_d_diff + x_s
+        # Algorithm 1:
+        # with stitching and without retargeting
+        x_d_i_new = stitching(x_s, x_d_i_new)
+        # the factor is fixed to 1.0 here
+        x_d_i_new = x_s + (x_d_i_new - x_s) * 1.0
+        out = warp_decode(f_s, x_s, x_d_i_new)
+        I_p_i = parse_output(out['out'])[0]
+        I_p_lst.append(I_p_i)
+        # paste the generated crop back into the original source image
+        I_p_pstbk = paste_back(I_p_i, crop_info['M_c2o'], source_rgb_lst[0], mask_ori_float)
+        I_p_pstbk_lst.append(I_p_pstbk)
+    mkdir(args.output_dir)
+    wfp_concat = None
+    ######### build the final concatenation result #########
+    # driving frame | source frame | generation
+    # driving_rgb_crop_256x256_lst is still None here when the driving input was a template
+    frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst)
+    if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video):
+        flag_source_has_audio = flag_is_source_video and has_audio_stream(args.source)
+        # NOTE(review): args.driving can also be a template file at this point
+        flag_driving_has_audio = has_audio_stream(args.driving)
+        wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.mp4')
+        # NOTE: update output fps
+        output_fps = source_fps if flag_is_source_video else output_fps
+        images2video(frames_concatenated, wfp=wfp_concat, fps=output_fps)
+        if flag_source_has_audio or flag_driving_has_audio:
+            # final result with concatenation
+            wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
+            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
+            logger.info(f"Audio is selected from {audio_from_which_video}, concat mode")
+            add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
+            # the muxed file replaces the silent one, so only one concat file remains
+            os.replace(wfp_concat_with_audio, wfp_concat)
+            logger.info(f"Replace {wfp_concat_with_audio} with {wfp_concat}")
+        # save the animated result
+        wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.mp4')
+        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
+            images2video(I_p_pstbk_lst, wfp=wfp, fps=output_fps)
+        else:
+            images2video(I_p_lst, wfp=wfp, fps=output_fps)
+        ######### build the final result #########
+        if flag_source_has_audio or flag_driving_has_audio:
+            wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
+            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
+            logger.info(f"Audio is selected from {audio_from_which_video}")
+            add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
+            os.replace(wfp_with_audio, wfp)
+            logger.info(f"Replace {wfp_with_audio} with {wfp}")
+        # final log
+        if wfp_template not in (None, ''):
+            logger.info(f'Animated template: {wfp_template}, you can specify `-d` argument with this template path next time to avoid cropping video, motion making and protecting privacy.', style='bold green')
+        logger.info(f'Animated video: {wfp}')
+        logger.info(f'Animated video with concat: {wfp_concat}')
+    else:
+        # image output; [..., ::-1] reverses the channel axis (the frames are rgb) before cv2.imwrite
+        wfp_concat = osp.join(args.output_dir, f'{basename(source)}--{basename(driving)}_concat.jpg')
+        cv2.imwrite(wfp_concat, frames_concatenated[0][..., ::-1])
+        wfp = osp.join(args.output_dir, f'{basename(source)}--{basename(driving)}.jpg')
+        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
+            cv2.imwrite(wfp, I_p_pstbk_lst[0][..., ::-1])
+        else:
+            # NOTE(review): the fallback writes the concatenated frame, not the animated one (I_p_lst) - confirm
+            cv2.imwrite(wfp, frames_concatenated[0][..., ::-1])
+        # final log
+        logger.info(f'Animated image: {wfp}')
+        logger.info(f'Animated image with concat: {wfp_concat}')
+if __name__ == "__main__":
+    """
+    Usage:
+        python3 infer_onnx.py --source ../assets/examples/source/s0.jpg --driving ../assets/examples/driving/d8.jpg --models onnx-models --output-dir output
+    """
+    timer = Timer()
+    timer.tic()
+    main()
+    elapse = timer.toc()
+    logger.debug(f'LivePortrait onnx infer time: {elapse:.3f}s')

python/requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+onnx
+onnxruntime
+opencv-python
+torch
+torchvision
+numpy
+loguru
+imageio[ffmpeg]
+ffprobe-python

python/utils/__init__.py ADDED Viewed

File without changes

python/utils/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (156 Bytes). View file

python/utils/__pycache__/crop.cpython-310.pyc ADDED Viewed

Binary file (10 kB). View file

python/utils/__pycache__/human_landmark_runner.cpython-310.pyc ADDED Viewed

Binary file (2.87 kB). View file

python/utils/__pycache__/rprint.cpython-310.pyc ADDED Viewed

Binary file (368 Bytes). View file

python/utils/__pycache__/timer.cpython-310.pyc ADDED Viewed

Binary file (1.01 kB). View file

python/utils/crop.py ADDED Viewed

	@@ -0,0 +1,423 @@

+# coding: utf-8
+"""
+cropping function and the related preprocess functions for cropping
+"""
+import numpy as np
+import os.path as osp
+from math import sin, cos, acos, degrees
+import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False) # NOTE: enforce single thread
+from .rprint import rprint as print
+DTYPE = np.float32
+CV2_INTERP = cv2.INTER_LINEAR
+def make_abs_path(fn):
+    return osp.join(osp.dirname(osp.realpath(__file__)), fn)
+def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
+    """ conduct similarity or affine transformation to the image, do not do border operation!
+    img:
+    M: 2x3 matrix or 3x3 matrix
+    dsize: target shape (width, height)
+    """
+    if isinstance(dsize, tuple) or isinstance(dsize, list):
+        _dsize = tuple(dsize)
+    else:
+        _dsize = (dsize, dsize)
+    if borderMode is not None:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags, borderMode=borderMode, borderValue=(0, 0, 0))
+    else:
+        return cv2.warpAffine(img, M[:2, :], dsize=_dsize, flags=flags)
+def _transform_pts(pts, M):
+    """ conduct similarity or affine transformation to the pts
+    pts: Nx2 ndarray
+    M: 2x3 matrix or 3x3 matrix
+    return: Nx2
+    """
+    return pts @ M[:2, :2].T + M[:2, 2]
+def parse_pt2_from_pt101(pt101, use_lip=True):
+    """
+    parsing the 2 points according to the 101 points, which cancels the roll
+    """
+    # the former version use the eye center, but it is not robust, now use interpolation
+    pt_left_eye = np.mean(pt101[[39, 42, 45, 48]], axis=0)  # left eye center
+    pt_right_eye = np.mean(pt101[[51, 54, 57, 60]], axis=0)  # right eye center
+    if use_lip:
+        # use lip
+        pt_center_eye = (pt_left_eye + pt_right_eye) / 2
+        pt_center_lip = (pt101[75] + pt101[81]) / 2
+        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
+    else:
+        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
+    return pt2
+def parse_pt2_from_pt106(pt106, use_lip=True):
+    """
+    parsing the 2 points according to the 106 points, which cancels the roll
+    """
+    pt_left_eye = np.mean(pt106[[33, 35, 40, 39]], axis=0)  # left eye center
+    pt_right_eye = np.mean(pt106[[87, 89, 94, 93]], axis=0)  # right eye center
+    if use_lip:
+        # use lip
+        pt_center_eye = (pt_left_eye + pt_right_eye) / 2
+        pt_center_lip = (pt106[52] + pt106[61]) / 2
+        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
+    else:
+        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
+    return pt2
+def parse_pt2_from_pt203(pt203, use_lip=True):
+    """
+    parsing the 2 points according to the 203 points, which cancels the roll
+    """
+    pt_left_eye = np.mean(pt203[[0, 6, 12, 18]], axis=0)  # left eye center
+    pt_right_eye = np.mean(pt203[[24, 30, 36, 42]], axis=0)  # right eye center
+    if use_lip:
+        # use lip
+        pt_center_eye = (pt_left_eye + pt_right_eye) / 2
+        pt_center_lip = (pt203[48] + pt203[66]) / 2
+        pt2 = np.stack([pt_center_eye, pt_center_lip], axis=0)
+    else:
+        pt2 = np.stack([pt_left_eye, pt_right_eye], axis=0)
+    return pt2
+def parse_pt2_from_pt68(pt68, use_lip=True):
+    """
+    parsing the 2 points according to the 68 points, which cancels the roll
+    """
+    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55], dtype=np.int32) - 1
+    if use_lip:
+        pt5 = np.stack([
+            np.mean(pt68[lm_idx[[1, 2]], :], 0),  # left eye
+            np.mean(pt68[lm_idx[[3, 4]], :], 0),  # right eye
+            pt68[lm_idx[0], :],  # nose
+            pt68[lm_idx[5], :],  # lip
+            pt68[lm_idx[6], :]   # lip
+        ], axis=0)
+        pt2 = np.stack([
+            (pt5[0] + pt5[1]) / 2,
+            (pt5[3] + pt5[4]) / 2
+        ], axis=0)
+    else:
+        pt2 = np.stack([
+            np.mean(pt68[lm_idx[[1, 2]], :], 0),  # left eye
+            np.mean(pt68[lm_idx[[3, 4]], :], 0),  # right eye
+        ], axis=0)
+    return pt2
+def parse_pt2_from_pt5(pt5, use_lip=True):
+    """
+    parsing the 2 points according to the 5 points, which cancels the roll
+    """
+    if use_lip:
+        pt2 = np.stack([
+            (pt5[0] + pt5[1]) / 2,
+            (pt5[3] + pt5[4]) / 2
+        ], axis=0)
+    else:
+        pt2 = np.stack([
+            pt5[0],
+            pt5[1]
+        ], axis=0)
+    return pt2
+def parse_pt2_from_pt9(pt9, use_lip=True):
+    '''
+    parsing the 2 points according to the 9 points, which cancels the roll
+    ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip']
+    '''
+    if use_lip:
+        pt9 = np.stack([
+            (pt9[2] + pt9[3]) / 2, # left eye
+            (pt9[0] + pt9[1]) / 2, # right eye
+            pt9[4],
+            (pt9[5] + pt9[6] ) / 2 # lip
+        ], axis=0)
+        pt2 = np.stack([
+            (pt9[0] + pt9[1]) / 2, # eye
+            pt9[3] # lip
+        ], axis=0)
+    else:
+        pt2 = np.stack([
+            (pt9[2] + pt9[3]) / 2,
+            (pt9[0] + pt9[1]) / 2,
+        ], axis=0)
+    return pt2
+def parse_pt2_from_pt_x(pts, use_lip=True):
+    if pts.shape[0] == 101:
+        pt2 = parse_pt2_from_pt101(pts, use_lip=use_lip)
+    elif pts.shape[0] == 106:
+        pt2 = parse_pt2_from_pt106(pts, use_lip=use_lip)
+    elif pts.shape[0] == 68:
+        pt2 = parse_pt2_from_pt68(pts, use_lip=use_lip)
+    elif pts.shape[0] == 5:
+        pt2 = parse_pt2_from_pt5(pts, use_lip=use_lip)
+    elif pts.shape[0] == 203:
+        pt2 = parse_pt2_from_pt203(pts, use_lip=use_lip)
+    elif pts.shape[0] > 101:
+        # take the first 101 points
+        pt2 = parse_pt2_from_pt101(pts[:101], use_lip=use_lip)
+    elif pts.shape[0] == 9:
+        pt2 = parse_pt2_from_pt9(pts, use_lip=use_lip)
+    else:
+        raise Exception(f'Unknow shape: {pts.shape}')
+    if not use_lip:
+        # NOTE: to compile with the latter code, need to rotate the pt2 90 degrees clockwise manually
+        v = pt2[1] - pt2[0]
+        pt2[1, 0] = pt2[0, 0] - v[1]
+        pt2[1, 1] = pt2[0, 1] + v[0]
+    return pt2
+def parse_rect_from_landmark(
+    pts,
+    scale=1.5,
+    need_square=True,
+    vx_ratio=0,
+    vy_ratio=0,
+    use_deg_flag=False,
+    **kwargs
+):
+    """parsing center, size, angle from 101/68/5/x landmarks
+    vx_ratio: the offset ratio along the pupil axis x-axis, multiplied by size
+    vy_ratio: the offset ratio along the pupil axis y-axis, multiplied by size, which is used to contain more forehead area
+    judge with pts.shape
+    """
+    pt2 = parse_pt2_from_pt_x(pts, use_lip=kwargs.get('use_lip', True))
+    uy = pt2[1] - pt2[0]
+    l = np.linalg.norm(uy)
+    if l <= 1e-3:
+        uy = np.array([0, 1], dtype=DTYPE)
+    else:
+        uy /= l
+    ux = np.array((uy[1], -uy[0]), dtype=DTYPE)
+    # the rotation degree of the x-axis, the clockwise is positive, the counterclockwise is negative (image coordinate system)
+    # print(uy)
+    # print(ux)
+    angle = acos(ux[0])
+    if ux[1] < 0:
+        angle = -angle
+    # rotation matrix
+    M = np.array([ux, uy])
+    # calculate the size which contains the angle degree of the bbox, and the center
+    center0 = np.mean(pts, axis=0)
+    rpts = (pts - center0) @ M.T  # (M @ P.T).T = P @ M.T
+    lt_pt = np.min(rpts, axis=0)
+    rb_pt = np.max(rpts, axis=0)
+    center1 = (lt_pt + rb_pt) / 2
+    size = rb_pt - lt_pt
+    if need_square:
+        m = max(size[0], size[1])
+        size[0] = m
+        size[1] = m
+    size *= scale  # scale size
+    center = center0 + ux * center1[0] + uy * center1[1]  # counterclockwise rotation, equivalent to M.T @ center1.T
+    center = center + ux * (vx_ratio * size) + uy * \
+        (vy_ratio * size)  # considering the offset in vx and vy direction
+    if use_deg_flag:
+        angle = degrees(angle)
+    return center, size, angle
+def parse_bbox_from_landmark(pts, **kwargs):
+    center, size, angle = parse_rect_from_landmark(pts, **kwargs)
+    cx, cy = center
+    w, h = size
+    # calculate the vertex positions before rotation
+    bbox = np.array([
+        [cx-w/2, cy-h/2],  # left, top
+        [cx+w/2, cy-h/2],
+        [cx+w/2, cy+h/2],  # right, bottom
+        [cx-w/2, cy+h/2]
+    ], dtype=DTYPE)
+    # construct rotation matrix
+    bbox_rot = bbox.copy()
+    R = np.array([
+        [np.cos(angle), -np.sin(angle)],
+        [np.sin(angle),  np.cos(angle)]
+    ], dtype=DTYPE)
+    # calculate the relative position of each vertex from the rotation center, then rotate these positions, and finally add the coordinates of the rotation center
+    bbox_rot = (bbox_rot - center) @ R.T + center
+    return {
+        'center': center,  # 2x1
+        'size': size,  # scalar
+        'angle': angle,  # rad, counterclockwise
+        'bbox': bbox,  # 4x2
+        'bbox_rot': bbox_rot,  # 4x2
+    }
+def crop_image_by_bbox(img, bbox, lmk=None, dsize=512, angle=None, flag_rot=False, **kwargs):
+    left, top, right, bot = bbox
+    if int(right - left) != int(bot - top):
+        print(f'right-left {right-left} != bot-top {bot-top}')
+    size = right - left
+    src_center = np.array([(left + right) / 2, (top + bot) / 2], dtype=DTYPE)
+    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)
+    s = dsize / size  # scale
+    if flag_rot and angle is not None:
+        costheta, sintheta = cos(angle), sin(angle)
+        cx, cy = src_center[0], src_center[1]  # ori center
+        tcx, tcy = tgt_center[0], tgt_center[1]  # target center
+        # need to infer
+        M_o2c = np.array(
+            [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
+             [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],
+            dtype=DTYPE
+        )
+    else:
+        M_o2c = np.array(
+            [[s, 0, tgt_center[0] - s * src_center[0]],
+             [0, s, tgt_center[1] - s * src_center[1]]],
+            dtype=DTYPE
+        )
+    # if flag_rot and angle is None:
+        # print('angle is None, but flag_rotate is True', style="bold yellow")
+    img_crop = _transform_img(img, M_o2c, dsize=dsize, borderMode=kwargs.get('borderMode', None))
+    lmk_crop = _transform_pts(lmk, M_o2c) if lmk is not None else None
+    M_o2c = np.vstack([M_o2c, np.array([0, 0, 1], dtype=DTYPE)])
+    M_c2o = np.linalg.inv(M_o2c)
+    # cv2.imwrite('crop.jpg', img_crop)
+    return {
+        'img_crop': img_crop,
+        'lmk_crop': lmk_crop,
+        'M_o2c': M_o2c,
+        'M_c2o': M_c2o,
+    }
+def _estimate_similar_transform_from_pts(
+    pts,
+    dsize,
+    scale=1.5,
+    vx_ratio=0,
+    vy_ratio=-0.1,
+    flag_do_rot=True,
+    **kwargs
+):
+    """ calculate the affine matrix of the cropped image from sparse points, the original image to the cropped image, the inverse is the cropped image to the original image
+    pts: landmark, 101 or 68 points or other points, Nx2
+    scale: the larger scale factor, the smaller face ratio
+    vx_ratio: x shift
+    vy_ratio: y shift, the smaller the y shift, the lower the face region
+    rot_flag: if it is true, conduct correction
+    """
+    center, size, angle = parse_rect_from_landmark(
+        pts, scale=scale, vx_ratio=vx_ratio, vy_ratio=vy_ratio,
+        use_lip=kwargs.get('use_lip', True)
+    )
+    s = dsize / size[0]  # scale
+    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)  # center of dsize
+    if flag_do_rot:
+        costheta, sintheta = cos(angle), sin(angle)
+        cx, cy = center[0], center[1]  # ori center
+        tcx, tcy = tgt_center[0], tgt_center[1]  # target center
+        # need to infer
+        M_INV = np.array(
+            [[s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
+             [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)]],
+            dtype=DTYPE
+        )
+    else:
+        M_INV = np.array(
+            [[s, 0, tgt_center[0] - s * center[0]],
+             [0, s, tgt_center[1] - s * center[1]]],
+            dtype=DTYPE
+        )
+    M_INV_H = np.vstack([M_INV, np.array([0, 0, 1])])
+    M = np.linalg.inv(M_INV_H)
+    # M_INV is from the original image to the cropped image, M is from the cropped image to the original image
+    return M_INV, M[:2, ...]
+def crop_image(img, pts: np.ndarray, **kwargs):
+    dsize = kwargs.get('dsize', 224)
+    scale = kwargs.get('scale', 1.5)  # 1.5 | 1.6
+    vy_ratio = kwargs.get('vy_ratio', -0.1)  # -0.0625 | -0.1
+    M_INV, _ = _estimate_similar_transform_from_pts(
+        pts,
+        dsize=dsize,
+        scale=scale,
+        vy_ratio=vy_ratio,
+        flag_do_rot=kwargs.get('flag_do_rot', True),
+    )
+    img_crop = _transform_img(img, M_INV, dsize)  # origin to crop
+    pt_crop = _transform_pts(pts, M_INV)
+    M_o2c = np.vstack([M_INV, np.array([0, 0, 1], dtype=DTYPE)])
+    M_c2o = np.linalg.inv(M_o2c)
+    ret_dct = {
+        'M_o2c': M_o2c,  # from the original image to the cropped image 3x3
+        'M_c2o': M_c2o,  # from the cropped image to the original image 3x3
+        'img_crop': img_crop,  # the cropped image
+        'pt_crop': pt_crop,  # the landmarks of the cropped image
+    }
+    return ret_dct
+def average_bbox_lst(bbox_lst):
+    if len(bbox_lst) == 0:
+        return None
+    bbox_arr = np.array(bbox_lst)
+    return np.mean(bbox_arr, axis=0).tolist()
+def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
+    """prepare mask for later image paste back
+    """
+    mask_ori = _transform_img(mask_crop, crop_M_c2o, dsize)
+    mask_ori = mask_ori.astype(np.float32) / 255.
+    return mask_ori
+def paste_back(img_crop, M_c2o, img_ori, mask_ori):
+    """paste back the image
+    """
+    dsize = (img_ori.shape[1], img_ori.shape[0])
+    result = _transform_img(img_crop, M_c2o, dsize=dsize)
+    result = np.clip(mask_ori * result + (1 - mask_ori) * img_ori, 0, 255).astype(np.uint8)
+    return result

python/utils/dependencies/XPose/config_model/UniPose_SwinT.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# Flat configuration module: every setting is a plain module-level assignment.
+# It selects modelname 'UniPose' with the 'swin_T_224_1k' backbone.
+# NOTE(review): `_base_` presumably tells the config loader to merge in
+# coco_transformer.py first -- confirm against the loader.
+_base_ = ['coco_transformer.py']
+use_label_enc = True
+num_classes=2
+# NOTE(review): the lr_*/epochs/weight_decay entries below look like training-only
+# settings -- confirm whether inference reads any of them.
+lr = 0.0001
+param_dict_type = 'default'
+lr_backbone = 1e-05
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 12
+lr_drop = 11
+save_checkpoint_interval = 100
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = False
+lr_drop_list = [33, 45]
+modelname = 'UniPose'
+frozen_weights = None
+# backbone name; build_backbone() accepts the resnet50/resnet101 and swin_* names
+backbone = 'swin_T_224_1k'
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+# build_backbone() asserts this is one of [0, 1, 2, 3], [1, 2, 3] or [3]
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+unic_layers = 0
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+pdetr3_bbox_embed_diff_each_layer = False
+pdetr3_refHW = -1
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dabdetr_yolo_like_anchor_update = False
+dabdetr_deformable_encoder = False
+dabdetr_deformable_decoder = False
+use_deformable_box_attn = False
+box_attn_type = 'roi_align'
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+decoder_layer_noise = False
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+add_channel_attention = False
+add_pos_value = False
+two_stage_type = 'standard'
+two_stage_pat_embed = 0
+two_stage_add_query_num = 0
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+masks = False
+decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']
+matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+dec_pred_bbox_embed_share = True
+dec_pred_class_embed_share = True
+# dn_* settings (NOTE(review): presumably the denoising-query options -- confirm)
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 1.0
+dn_label_noise_ratio = 0.5
+dn_label_coef=1.0
+dn_bbox_coef=1.0
+embed_init_tgt = True
+dn_labelbook_size = 2000
+match_unstable_error = True
+# for ema
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+use_detached_boxes_dec_out = False
+# text branch settings
+max_text_len = 256
+shuffle_type = None
+use_text_enhancer = True
+use_fusion_layer = True
+# use_checkpoint is read by build_backbone() and passed to the swin backbone builder
+use_checkpoint = False # True
+use_transformer_ckpt = True
+text_encoder_type = 'bert-base-uncased'
+use_text_cross_attention = True
+text_dropout = 0.0
+fusion_dropout = 0.0
+fusion_droppath = 0.1
+num_body_points=68
+binary_query_selection = False
+use_cdn = True
+ffn_extra_layernorm = False
+fix_size=False

python/utils/dependencies/XPose/config_model/coco_transformer.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# Data-augmentation size settings; this file is listed in `_base_` of UniPose_SwinT.py.
+# NOTE(review): the numbers look like image side lengths in pixels -- confirm
+# against the data pipeline that consumes them.
+data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+data_aug_max_size = 1333
+data_aug_scales2_resize = [400, 500, 600]
+data_aug_scales2_crop = [384, 600]
+data_aug_scale_overlap = None

python/utils/dependencies/XPose/models/UniPose/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+from .unipose import build_unipose

python/utils/dependencies/XPose/models/UniPose/attention.py ADDED Viewed

	@@ -0,0 +1,373 @@

+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from codes in torch.nn
+# ------------------------------------------------------------------------
+"""
+MultiheadAttention that supports query, key, and value having different dimensions.
+Query, key, and value projections are removed.
+Mostly copy-paste from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/activation.py#L873
+and https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py#L4837
+"""
+import warnings
+import torch
+from torch.nn.modules.linear import Linear
+from torch.nn.init import constant_
+from torch.nn.modules.module import Module
+from torch._jit_internal import Optional, Tuple
+try:
+    from torch.overrides import has_torch_function, handle_torch_function
+except:
+    from torch._overrides import has_torch_function, handle_torch_function
+from torch.nn.functional import linear, pad, softmax, dropout
+Tensor = torch.Tensor
+class MultiheadAttention(Module):
+    r"""Allows the model to jointly attend to information
+    from different representation subspaces.
+    See reference: Attention Is All You Need
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
+    Args:
+        embed_dim: total dimension of the model.
+        num_heads: parallel attention heads.
+        dropout: a Dropout layer on attn_output_weights. Default: 0.0.
+        bias: add bias as module parameter. Default: True.
+        add_bias_kv: add bias to the key and value sequences at dim=0.
+        add_zero_attn: add a new batch of zeros to the key and
+                       value sequences at dim=1.
+        kdim: total number of features in key. Default: None.
+        vdim: total number of features in value. Default: None.
+        Note: if kdim and vdim are None, they will be set to embed_dim such that
+        query, key, and value have the same number of features.
+    Examples::
+        >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
+        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
+    """
+    bias_k: Optional[torch.Tensor]
+    bias_v: Optional[torch.Tensor]
+    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
+        super(MultiheadAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+        vdim = vdim if vdim is not None else embed_dim
+        self.out_proj = Linear(vdim , vdim)
+        self.in_proj_bias = None
+        self.in_proj_weight = None
+        self.bias_k = self.bias_v = None
+        self.q_proj_weight = None
+        self.k_proj_weight = None
+        self.v_proj_weight = None
+        self.add_zero_attn = add_zero_attn
+        self._reset_parameters()
+    def _reset_parameters(self):
+        constant_(self.out_proj.bias, 0.)
+    def __setstate__(self, state):
+        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
+        if '_qkv_same_embed_dim' not in state:
+            state['_qkv_same_embed_dim'] = True
+        super(MultiheadAttention, self).__setstate__(state)
+    def forward(self, query, key, value, key_padding_mask=None,
+                need_weights=True, attn_mask=None):
+        # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]
+        r"""
+    Args:
+        query, key, value: map a query and a set of key-value pairs to an output.
+            See "Attention Is All You Need" for more details.
+        key_padding_mask: if provided, specified padding elements in the key will
+            be ignored by the attention. When given a binary mask and a value is True,
+            the corresponding value on the attention layer will be ignored. When given
+            a byte mask and a value is non-zero, the corresponding value on the attention
+            layer will be ignored
+        need_weights: output attn_output_weights.
+        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+            the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+    Shape:
+        - Inputs:
+        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+          the embedding dimension.
+        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+          the embedding dimension.
+        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
+          the embedding dimension.
+        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+          If a ByteTensor is provided, the non-zero positions will be ignored while the position
+          with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the
+          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+          3D mask :math:`(N*\text{num_heads}, L, S)` where N is the batch size, L is the target sequence length,
+          S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked
+          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
+          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+          is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+          is provided, it will be added to the attention weight.
+        - Outputs:
+        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
+          E is the embedding dimension.
+        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
+          L is the target sequence length, S is the source sequence length.
+        """
+        if not self._qkv_same_embed_dim:
+            return multi_head_attention_forward(
+                query, key, value, self.embed_dim, self.num_heads,
+                self.in_proj_weight, self.in_proj_bias,
+                self.bias_k, self.bias_v, self.add_zero_attn,
+                self.dropout, self.out_proj.weight, self.out_proj.bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask, need_weights=need_weights,
+                attn_mask=attn_mask, use_separate_proj_weight=True,
+                q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
+                v_proj_weight=self.v_proj_weight, out_dim=self.vdim)
+        else:
+            return multi_head_attention_forward(
+                query, key, value, self.embed_dim, self.num_heads,
+                self.in_proj_weight, self.in_proj_bias,
+                self.bias_k, self.bias_v, self.add_zero_attn,
+                self.dropout, self.out_proj.weight, self.out_proj.bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask, need_weights=need_weights,
+                attn_mask=attn_mask, out_dim=self.vdim)
+def multi_head_attention_forward(query: Tensor,
+                                 key: Tensor,
+                                 value: Tensor,
+                                 embed_dim_to_check: int,
+                                 num_heads: int,
+                                 in_proj_weight: Tensor,
+                                 in_proj_bias: Tensor,
+                                 bias_k: Optional[Tensor],
+                                 bias_v: Optional[Tensor],
+                                 add_zero_attn: bool,
+                                 dropout_p: float,
+                                 out_proj_weight: Tensor,
+                                 out_proj_bias: Tensor,
+                                 training: bool = True,
+                                 key_padding_mask: Optional[Tensor] = None,
+                                 need_weights: bool = True,
+                                 attn_mask: Optional[Tensor] = None,
+                                 use_separate_proj_weight: bool = False,
+                                 q_proj_weight: Optional[Tensor] = None,
+                                 k_proj_weight: Optional[Tensor] = None,
+                                 v_proj_weight: Optional[Tensor] = None,
+                                 static_k: Optional[Tensor] = None,
+                                 static_v: Optional[Tensor] = None,
+                                 out_dim: Optional[Tensor] = None
+                                 ) -> Tuple[Tensor, Optional[Tensor]]:
+    r"""
+    Args:
+        query, key, value: map a query and a set of key-value pairs to an output.
+            See "Attention Is All You Need" for more details.
+        embed_dim_to_check: total dimension of the model.
+        num_heads: parallel attention heads.
+        in_proj_weight, in_proj_bias: input projection weight and bias.
+        bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
+        add_zero_attn: add a new batch of zeros to the key and
+                       value sequences at dim=1.
+        dropout_p: probability of an element to be zeroed.
+        out_proj_weight, out_proj_bias: the output projection weight and bias.
+        training: apply dropout if is ``True``.
+        key_padding_mask: if provided, specified padding elements in the key will
+            be ignored by the attention. This is an binary mask. When the value is True,
+            the corresponding value on the attention layer will be filled with -inf.
+        need_weights: output attn_output_weights.
+        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+            the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+        use_separate_proj_weight: the function accept the proj. weights for query, key,
+            and value in different forms. If false, in_proj_weight will be used, which is
+            a combination of q_proj_weight, k_proj_weight, v_proj_weight.
+        q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
+        static_k, static_v: static key and value used for attention operators.
+    Shape:
+        Inputs:
+        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+          the embedding dimension.
+        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+          the embedding dimension.
+        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
+          the embedding dimension.
+        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+          If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
+          will be unchanged. If a BoolTensor is provided, the positions with the
+          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
+          S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
+          positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend
+          while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True``
+          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+          is provided, it will be added to the attention weight.
+        - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
+          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
+        - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
+          N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
+        Outputs:
+        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
+          E is the embedding dimension.
+        - attn_output_weights: :math:`(N, L, S)` where N is the batch size,
+          L is the target sequence length, S is the source sequence length.
+    """
+    if not torch.jit.is_scripting():
+        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
+                    out_proj_weight, out_proj_bias)
+        if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
+            return handle_torch_function(
+                multi_head_attention_forward, tens_ops, query, key, value,
+                embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
+                bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
+                out_proj_bias, training=training, key_padding_mask=key_padding_mask,
+                need_weights=need_weights, attn_mask=attn_mask,
+                use_separate_proj_weight=use_separate_proj_weight,
+                q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
+                v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v)
+    tgt_len, bsz, embed_dim = query.size()
+    assert embed_dim == embed_dim_to_check
+    # allow MHA to have different sizes for the feature dimension
+    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
+    head_dim = embed_dim // num_heads
+    v_head_dim = out_dim // num_heads
+    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
+    scaling = float(head_dim) ** -0.5
+    q = query * scaling
+    k = key
+    v = value
+    if attn_mask is not None:
+        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
+            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
+            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
+        if attn_mask.dtype == torch.uint8:
+            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+            attn_mask = attn_mask.to(torch.bool)
+        if attn_mask.dim() == 2:
+            attn_mask = attn_mask.unsqueeze(0)
+            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
+                raise RuntimeError('The size of the 2D attn_mask is not correct.')
+        elif attn_mask.dim() == 3:
+            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
+                raise RuntimeError('The size of the 3D attn_mask is not correct.')
+        else:
+            raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
+        # attn_mask's dim is 3 now.
+    # convert ByteTensor key_padding_mask to bool
+    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
+        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+        key_padding_mask = key_padding_mask.to(torch.bool)
+    if bias_k is not None and bias_v is not None:
+        if static_k is None and static_v is None:
+            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
+            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
+            if attn_mask is not None:
+                attn_mask = pad(attn_mask, (0, 1))
+            if key_padding_mask is not None:
+                key_padding_mask = pad(key_padding_mask, (0, 1))
+        else:
+            assert static_k is None, "bias cannot be added to static key."
+            assert static_v is None, "bias cannot be added to static value."
+    else:
+        assert bias_k is None
+        assert bias_v is None
+    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+    if k is not None:
+        k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
+    if v is not None:
+        v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1)
+    if static_k is not None:
+        assert static_k.size(0) == bsz * num_heads
+        assert static_k.size(2) == head_dim
+        k = static_k
+    if static_v is not None:
+        assert static_v.size(0) == bsz * num_heads
+        assert static_v.size(2) == v_head_dim
+        v = static_v
+    src_len = k.size(1)
+    if key_padding_mask is not None:
+        assert key_padding_mask.size(0) == bsz
+        assert key_padding_mask.size(1) == src_len
+    if add_zero_attn:
+        src_len += 1
+        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
+        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
+        if attn_mask is not None:
+            attn_mask = pad(attn_mask, (0, 1))
+        if key_padding_mask is not None:
+            key_padding_mask = pad(key_padding_mask, (0, 1))
+    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
+    if attn_mask is not None:
+        if attn_mask.dtype == torch.bool:
+            attn_output_weights.masked_fill_(attn_mask, float('-inf'))
+        else:
+            attn_output_weights += attn_mask
+    if key_padding_mask is not None:
+        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+        attn_output_weights = attn_output_weights.masked_fill(
+            key_padding_mask.unsqueeze(1).unsqueeze(2),
+            float('-inf'),
+        )
+        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
+    # attn_output_weights = softmax(
+    #     attn_output_weights, dim=-1)
+    attn_output_weights = softmax(
+            attn_output_weights - attn_output_weights.max(dim=-1, keepdim=True)[0], dim=-1)
+    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)
+    attn_output = torch.bmm(attn_output_weights, v)
+    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim]
+    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim)
+    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+    if need_weights:
+        # average attention weights over heads
+        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
+        return attn_output, attn_output_weights.sum(dim=1) / num_heads
+    else:
+        return attn_output, None

python/utils/dependencies/XPose/models/UniPose/backbone.py ADDED Viewed

	@@ -0,0 +1,211 @@

+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+"""
+Backbone modules.
+"""
+import torch
+import torch.nn.functional as F
+import torchvision
+from torch import nn
+from torchvision.models._utils import IntermediateLayerGetter
+from typing import Dict, List
+from util.misc import NestedTensor, is_main_process
+from .position_encoding import build_position_encoding
+from .swin_transformer import build_swin_transformer
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+    Copy-paste from torchvision.misc.ops with added eps before rqsrt,
+    without which any other models than torchvision.models.resnet[18,34,50,101]
+    produce nans.
+    """
+    def __init__(self, n):
+        super(FrozenBatchNorm2d, self).__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+        super(FrozenBatchNorm2d, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        w = self.weight.reshape(1, -1, 1, 1)
+        b = self.bias.reshape(1, -1, 1, 1)
+        rv = self.running_var.reshape(1, -1, 1, 1)
+        rm = self.running_mean.reshape(1, -1, 1, 1)
+        eps = 1e-5
+        scale = w * (rv + eps).rsqrt()
+        bias = b - rm * scale
+        return x * scale + bias
+class BackboneBase(nn.Module):
+    def __init__(
+        self,
+        backbone: nn.Module,
+        train_backbone: bool,
+        num_channels: int,
+        return_interm_indices: list,
+    ):
+        super().__init__()
+        for name, parameter in backbone.named_parameters():
+            if (
+                not train_backbone
+                or "layer2" not in name
+                and "layer3" not in name
+                and "layer4" not in name
+            ):
+                parameter.requires_grad_(False)
+        return_layers = {}
+        for idx, layer_index in enumerate(return_interm_indices):
+            return_layers.update(
+                {"layer{}".format(5 - len(return_interm_indices) + idx): "{}".format(layer_index)}
+            )
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.num_channels = num_channels
+    def forward(self, tensor_list: NestedTensor):
+        xs = self.body(tensor_list.tensors)
+        out: Dict[str, NestedTensor] = {}
+        for name, x in xs.items():
+            m = tensor_list.mask
+            assert m is not None
+            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
+            out[name] = NestedTensor(x, mask)
+        # import ipdb; ipdb.set_trace()
+        return out
+class Backbone(BackboneBase):
+    """ResNet backbone with frozen BatchNorm."""
+    def __init__(
+        self,
+        name: str,
+        train_backbone: bool,
+        dilation: bool,
+        return_interm_indices: list,
+        batch_norm=FrozenBatchNorm2d,
+    ):
+        if name in ["resnet18", "resnet34", "resnet50", "resnet101"]:
+            backbone = getattr(torchvision.models, name)(
+                replace_stride_with_dilation=[False, False, dilation],
+                pretrained=is_main_process(),
+                norm_layer=batch_norm,
+            )
+        else:
+            raise NotImplementedError("Why you can get here with name {}".format(name))
+        # num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
+        assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available."
+        assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
+        num_channels_all = [256, 512, 1024, 2048]
+        num_channels = num_channels_all[4 - len(return_interm_indices) :]
+        super().__init__(backbone, train_backbone, num_channels, return_interm_indices)
+class Joiner(nn.Sequential):
+    def __init__(self, backbone, position_embedding):
+        super().__init__(backbone, position_embedding)
+    def forward(self, tensor_list: NestedTensor):
+        xs = self[0](tensor_list)
+        out: List[NestedTensor] = []
+        pos = []
+        for name, x in xs.items():
+            out.append(x)
+            # position encoding
+            pos.append(self[1](x).to(x.tensors.dtype))
+        return out, pos
+def build_backbone(args):
+    """
+    Useful args:
+        - backbone: backbone name
+        - lr_backbone:
+        - dilation
+        - return_interm_indices: available: [0,1,2,3], [1,2,3], [3]
+        - backbone_freeze_keywords:
+        - use_checkpoint: for swin only for now
+    """
+    position_embedding = build_position_encoding(args)
+    train_backbone = True
+    if not train_backbone:
+        raise ValueError("Please set lr_backbone > 0")
+    return_interm_indices = args.return_interm_indices
+    assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
+    args.backbone_freeze_keywords
+    use_checkpoint = getattr(args, "use_checkpoint", False)
+    if args.backbone in ["resnet50", "resnet101"]:
+        backbone = Backbone(
+            args.backbone,
+            train_backbone,
+            args.dilation,
+            return_interm_indices,
+            batch_norm=FrozenBatchNorm2d,
+        )
+        bb_num_channels = backbone.num_channels
+    elif args.backbone in [
+        "swin_T_224_1k",
+        "swin_B_224_22k",
+        "swin_B_384_22k",
+        "swin_L_224_22k",
+        "swin_L_384_22k",
+    ]:
+        pretrain_img_size = int(args.backbone.split("_")[-2])
+        backbone = build_swin_transformer(
+            args.backbone,
+            pretrain_img_size=pretrain_img_size,
+            out_indices=tuple(return_interm_indices),
+            dilation=False,
+            use_checkpoint=use_checkpoint,
+        )
+        bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :]
+    else:
+        raise NotImplementedError("Unknown backbone {}".format(args.backbone))
+    assert len(bb_num_channels) == len(
+        return_interm_indices
+    ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"
+    model = Joiner(backbone, position_embedding)
+    model.num_channels = bb_num_channels
+    assert isinstance(
+        bb_num_channels, List
+    ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))
+    return model

python/utils/dependencies/XPose/models/UniPose/deformable_transformer.py ADDED Viewed

	@@ -0,0 +1,1230 @@

+# ------------------------------------------------------------------------
+# UniPose
+# url: https://github.com/IDEA-Research/UniPose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# DINO
+# Copyright (c) 2022 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+import math
+import copy
+import torch
+import torch.utils.checkpoint as checkpoint
+from torch import nn, Tensor
+from typing import Optional
+from util.misc import inverse_sigmoid
+from .transformer_vanilla import TransformerEncoderLayer
+from .fuse_modules import BiAttentionBlock
+from .utils import gen_encoder_output_proposals, MLP, _get_activation_fn, gen_sineembed_for_position, get_sine_pos_embed
+from .ops.modules import MSDeformAttn
+class DeformableTransformer(nn.Module):
+    def __init__(self, d_model=256, nhead=8,
+                 num_queries=300,
+                 num_encoder_layers=6,
+                 num_unicoder_layers=0,
+                 num_decoder_layers=6,
+                 dim_feedforward=2048, dropout=0.0,
+                 activation="relu", normalize_before=False,
+                 return_intermediate_dec=False, query_dim=4,
+                 num_patterns=0,
+                 modulate_hw_attn=False,
+                 # for deformable encoder
+                 deformable_encoder=False,
+                 deformable_decoder=False,
+                 num_feature_levels=1,
+                 enc_n_points=4,
+                 dec_n_points=4,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 # init query
+                 learnable_tgt_init=False,
+                 decoder_query_perturber=None,
+                 add_channel_attention=False,
+                 add_pos_value=False,
+                 random_refpoints_xy=False,
+                 # two stage
+                 two_stage_type='no',
+                 two_stage_pat_embed=0,
+                 two_stage_add_query_num=0,
+                 two_stage_learn_wh=False,
+                 two_stage_keep_all_tokens=False,
+                 # evo of #anchors
+                 dec_layer_number=None,
+                 rm_enc_query_scale=True,
+                 rm_dec_query_scale=True,
+                 rm_self_attn_layers=None,
+                 key_aware_type=None,
+                 # layer share
+                 layer_share_type=None,
+                 # for detach
+                 rm_detach=None,
+                 decoder_sa_type='ca',
+                 module_seq=['sa', 'ca', 'ffn'],
+                 # for dn
+                 embed_init_tgt=False,
+                 use_detached_boxes_dec_out=False,
+                 use_text_enhancer=False,
+                 use_fusion_layer=False,
+                 use_checkpoint=False,
+                 use_transformer_ckpt=False,
+                 use_text_cross_attention=False,
+                 text_dropout=0.1,
+                 fusion_dropout=0.1,
+                 fusion_droppath=0.0,
+                 binary_query_selection=False,
+                 ffn_extra_layernorm=False,
+                 ):
+        super().__init__()
+        self.num_feature_levels = num_feature_levels
+        self.num_encoder_layers = num_encoder_layers
+        self.num_unicoder_layers = num_unicoder_layers
+        self.num_decoder_layers = num_decoder_layers
+        self.deformable_encoder = deformable_encoder
+        self.deformable_decoder = deformable_decoder
+        self.two_stage_keep_all_tokens = two_stage_keep_all_tokens
+        self.num_queries = num_queries
+        self.random_refpoints_xy = random_refpoints_xy
+        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
+        self.ffn_extra_layernorm = ffn_extra_layernorm
+        assert query_dim == 4
+        self.binary_query_selection = binary_query_selection
+        if self.binary_query_selection:
+            self.binary_query_selection_layer = nn.Linear(d_model, 1)
+        # assert not binary_query_selection, 'binary_query_selection not implemented yet'
+        if num_feature_levels > 1:
+            assert deformable_encoder, "only support deformable_encoder for num_feature_levels > 1"
+        if use_deformable_box_attn:
+            assert deformable_encoder or deformable_encoder
+        assert layer_share_type in [None, 'encoder', 'decoder', 'both']
+        if layer_share_type in ['encoder', 'both']:
+            enc_layer_share = True
+        else:
+            enc_layer_share = False
+        if layer_share_type in ['decoder', 'both']:
+            dec_layer_share = True
+        else:
+            dec_layer_share = False
+        assert layer_share_type is None
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
+        # choose encoder layer type
+        if deformable_encoder:
+            encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
+                                                              dropout, activation,
+                                                              num_feature_levels, nhead, enc_n_points,
+                                                              add_channel_attention=add_channel_attention,
+                                                              use_deformable_box_attn=use_deformable_box_attn,
+                                                              box_attn_type=box_attn_type)
+        else:
+            raise NotImplementedError
+        if use_text_enhancer:
+            text_enhance_layer = TransformerEncoderLayer(
+                d_model=d_model,
+                nhead=nhead // 2,
+                dim_feedforward=dim_feedforward // 2,
+                dropout=text_dropout
+            )
+        else:
+            text_enhance_layer = None
+        if use_fusion_layer:
+            feature_fusion_layer = BiAttentionBlock(
+                v_dim=d_model,
+                l_dim=d_model,
+                embed_dim=dim_feedforward // 2,
+                num_heads=nhead // 2,
+                dropout=fusion_dropout,
+                drop_path=fusion_droppath
+            )
+        else:
+            feature_fusion_layer = None
+        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+        assert encoder_norm is None
+        self.encoder = TransformerEncoder(
+            encoder_layer, num_encoder_layers, d_model=d_model,
+            num_queries=num_queries,
+            enc_layer_share=enc_layer_share,
+            text_enhance_layer=text_enhance_layer,
+            feature_fusion_layer=feature_fusion_layer,
+            use_checkpoint=use_checkpoint,
+            use_transformer_ckpt=use_transformer_ckpt,
+        )
+        # choose decoder layer type
+        if deformable_decoder:
+            decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
+                                                              dropout, activation,
+                                                              num_feature_levels, nhead, dec_n_points,
+                                                              use_text_cross_attention=use_text_cross_attention,
+                                                              ffn_extra_layernorm=ffn_extra_layernorm, )
+        else:
+            raise NotImplementedError
+        decoder_norm = nn.LayerNorm(d_model)
+        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
+                                          return_intermediate=return_intermediate_dec,
+                                          d_model=d_model, query_dim=query_dim,
+                                          modulate_hw_attn=modulate_hw_attn,
+                                          num_feature_levels=num_feature_levels,
+                                          deformable_decoder=deformable_decoder,
+                                          decoder_query_perturber=decoder_query_perturber,
+                                          dec_layer_number=dec_layer_number, rm_dec_query_scale=rm_dec_query_scale,
+                                          dec_layer_share=dec_layer_share,
+                                          use_detached_boxes_dec_out=use_detached_boxes_dec_out
+                                          )
+        self.d_model = d_model
+        self.nhead = nhead
+        self.dec_layers = num_decoder_layers
+        self.num_queries = num_queries  # useful for single stage model only
+        self.num_patterns = num_patterns
+        if not isinstance(num_patterns, int):
+            Warning("num_patterns should be int but {}".format(type(num_patterns)))
+            self.num_patterns = 0
+        if num_feature_levels > 1:
+            if self.num_encoder_layers > 0:
+                self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
+            else:
+                self.level_embed = None
+        self.learnable_tgt_init = learnable_tgt_init
+        assert learnable_tgt_init, "why not learnable_tgt_init"
+        self.embed_init_tgt = embed_init_tgt
+        if (two_stage_type != 'no' and embed_init_tgt) or (two_stage_type == 'no'):
+            self.tgt_embed = nn.Embedding(self.num_queries, d_model)
+            nn.init.normal_(self.tgt_embed.weight.data)
+        else:
+            self.tgt_embed = None
+        # for two stage
+        self.two_stage_type = two_stage_type
+        self.two_stage_pat_embed = two_stage_pat_embed
+        self.two_stage_add_query_num = two_stage_add_query_num
+        self.two_stage_learn_wh = two_stage_learn_wh
+        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
+        if two_stage_type == 'standard':
+            # anchor selection at the output of encoder
+            self.enc_output = nn.Linear(d_model, d_model)
+            self.enc_output_norm = nn.LayerNorm(d_model)
+            if two_stage_pat_embed > 0:
+                self.pat_embed_for_2stage = nn.Parameter(torch.Tensor(two_stage_pat_embed, d_model))
+                nn.init.normal_(self.pat_embed_for_2stage)
+            if two_stage_add_query_num > 0:
+                self.tgt_embed = nn.Embedding(self.two_stage_add_query_num, d_model)
+            if two_stage_learn_wh:
+                # import ipdb; ipdb.set_trace()
+                self.two_stage_wh_embedding = nn.Embedding(1, 2)
+            else:
+                self.two_stage_wh_embedding = None
+        if two_stage_type == 'no':
+            self.init_ref_points(num_queries)  # init self.refpoint_embed
+        self.enc_out_class_embed = None
+        self.enc_out_bbox_embed = None
+        # evolution of anchors
+        self.dec_layer_number = dec_layer_number
+        if dec_layer_number is not None:
+            if self.two_stage_type != 'no' or num_patterns == 0:
+                assert dec_layer_number[
+                           0] == num_queries, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries})"
+            else:
+                assert dec_layer_number[
+                           0] == num_queries * num_patterns, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries}) * num_patterns({num_patterns})"
+        self._reset_parameters()
+        self.rm_self_attn_layers = rm_self_attn_layers
+        if rm_self_attn_layers is not None:
+            # assert len(rm_self_attn_layers) == num_decoder_layers
+            print("Removing the self-attn in {} decoder layers".format(rm_self_attn_layers))
+            for lid, dec_layer in enumerate(self.decoder.layers):
+                if lid in rm_self_attn_layers:
+                    dec_layer.rm_self_attn_modules()
+        self.rm_detach = rm_detach
+        if self.rm_detach:
+            assert isinstance(rm_detach, list)
+            assert any([i in ['enc_ref', 'enc_tgt', 'dec'] for i in rm_detach])
+        self.decoder.rm_detach = rm_detach
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MSDeformAttn):
+                m._reset_parameters()
+        if self.num_feature_levels > 1 and self.level_embed is not None:
+            nn.init.normal_(self.level_embed)
+        if self.two_stage_learn_wh:
+            nn.init.constant_(self.two_stage_wh_embedding.weight, math.log(0.05 / (1 - 0.05)))
+    def get_valid_ratio(self, mask):
+        _, H, W = mask.shape
+        valid_H = torch.sum(~mask[:, :, 0], 1)
+        valid_W = torch.sum(~mask[:, 0, :], 1)
+        valid_ratio_h = valid_H.float() / H
+        valid_ratio_w = valid_W.float() / W
+        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+        return valid_ratio
+    def init_ref_points(self, use_num_queries):
+        self.refpoint_embed = nn.Embedding(use_num_queries, 4)
+        if self.random_refpoints_xy:
+            # import ipdb; ipdb.set_trace()
+            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
+            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
+            self.refpoint_embed.weight.data[:, :2].requires_grad = False
+    def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, attn_mask2=None, text_dict=None,
+                dn_meta=None,targets=None,kpt_embed=None):
+        """
+        Input:
+            - srcs: List of multi features [bs, ci, hi, wi]
+            - masks: List of multi masks [bs, hi, wi]
+            - refpoint_embed: [bs, num_dn, 4]. None in infer
+            - pos_embeds: List of multi pos embeds [bs, ci, hi, wi]
+            - tgt: [bs, num_dn, d_model]. None in infer
+        """
+        # if self.two_stage_type != 'no' and self.two_stage_add_query_num == 0:
+        #     assert refpoint_embed is None
+        # prepare input for encoder
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
+            bs, c, h, w = src.shape
+            spatial_shape = (h, w)
+            spatial_shapes.append(spatial_shape)
+            src = src.flatten(2).transpose(1, 2)  # bs, hw, c
+            mask = mask.flatten(1)  # bs, hw
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)  # bs, hw, c
+            if self.num_feature_levels > 1 and self.level_embed is not None:
+                lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            else:
+                lvl_pos_embed = pos_embed
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            src_flatten.append(src)
+            mask_flatten.append(mask)
+        src_flatten = torch.cat(src_flatten, 1)  # bs, \sum{hxw}, c
+        mask_flatten = torch.cat(mask_flatten, 1)  # bs, \sum{hxw}
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)  # bs, \sum{hxw}, c
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+        # two stage
+        enc_topk_proposals = enc_refpoint_embed = None
+        #########################################################
+        # Begin Encoder
+        #########################################################
+        memory, memory_text = self.encoder(
+            src_flatten,
+            pos=lvl_pos_embed_flatten,
+            level_start_index=level_start_index,
+            spatial_shapes=spatial_shapes,
+            valid_ratios=valid_ratios,
+            key_padding_mask=mask_flatten,
+            memory_text=text_dict['encoded_text'],
+            text_attention_mask=~text_dict['text_token_mask'],
+            # we ~ the mask . False means use the token; True means pad the token
+            position_ids=text_dict['position_ids'],
+            text_self_attention_masks=text_dict['text_self_attention_masks'],
+        )
+        #########################################################
+        # End Encoder
+        # - memory: bs, \sum{hw}, c
+        # - mask_flatten: bs, \sum{hw}
+        # - lvl_pos_embed_flatten: bs, \sum{hw}, c
+        # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
+        # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
+        #########################################################
+        text_dict['encoded_text'] = memory_text
+        if self.two_stage_type == 'standard':
+            if self.two_stage_learn_wh:
+                input_hw = self.two_stage_wh_embedding.weight[0]
+            else:
+                input_hw = None
+            output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes,
+                                                                           input_hw)
+            output_memory = self.enc_output_norm(self.enc_output(output_memory))
+            if self.two_stage_pat_embed > 0:
+                bs, nhw, _ = output_memory.shape
+                # output_memory: bs, n, 256; self.pat_embed_for_2stage: k, 256
+                output_memory = output_memory.repeat(1, self.two_stage_pat_embed, 1)
+                _pats = self.pat_embed_for_2stage.repeat_interleave(nhw, 0)
+                output_memory = output_memory + _pats
+                output_proposals = output_proposals.repeat(1, self.two_stage_pat_embed, 1)
+            if self.two_stage_add_query_num > 0:
+                assert refpoint_embed is not None
+                output_memory = torch.cat((output_memory, tgt), dim=1)
+                output_proposals = torch.cat((output_proposals, refpoint_embed), dim=1)
+            if self.binary_query_selection:
+                topk_logits = self.binary_query_selection_layer(output_memory).squeeze(-1)
+            else:
+                if text_dict is not None:
+                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
+                else:
+                    enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)
+                topk_logits = enc_outputs_class_unselected.max(-1)[0]
+            enc_outputs_coord_unselected = self.enc_out_bbox_embed(
+                output_memory) + output_proposals  # (bs, \sum{hw}, 4) unsigmoid
+            topk = self.num_queries
+            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq
+            # gather boxes
+            refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1,
+                                                   topk_proposals.unsqueeze(-1).repeat(1, 1, 4))  # unsigmoid
+            refpoint_embed_ = refpoint_embed_undetach.detach()
+            init_box_proposal = torch.gather(output_proposals, 1,
+                                             topk_proposals.unsqueeze(-1).repeat(1, 1, 4)).sigmoid()  # sigmoid
+            # gather tgt
+            tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
+            if self.embed_init_tgt:
+                tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # nq, bs, d_model
+            else:
+                tgt_ = tgt_undetach.detach()
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+        elif self.two_stage_type == 'no':
+            tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # nq, bs, d_model
+            refpoint_embed_ = self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)  # nq, bs, 4
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+            if self.num_patterns > 0:
+                tgt_embed = tgt.repeat(1, self.num_patterns, 1)
+                refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
+                tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(self.num_queries,
+                                                                             1)  # 1, n_q*n_pat, d_model
+                tgt = tgt_embed + tgt_pat
+            init_box_proposal = refpoint_embed_.sigmoid()
+        else:
+            raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
+        #########################################################
+        # End preparing tgt
+        # - tgt: bs, NQ, d_model
+        # - refpoint_embed(unsigmoid): bs, NQ, d_model
+        #########################################################
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     if refpoint_embed.isnan().any() | refpoint_embed.isinf().any():
+        #         import ipdb; ipdb.set_trace()
+        #     if tgt.isnan().any() | tgt.isinf().any():
+        #         import ipdb; ipdb.set_trace()
+        #########################################################
+        # Begin Decoder
+        #########################################################
+        hs, references = self.decoder(
+            tgt=tgt.transpose(0, 1),
+            memory=memory.transpose(0, 1),
+            memory_key_padding_mask=mask_flatten,
+            pos=lvl_pos_embed_flatten.transpose(0, 1),
+            refpoints_unsigmoid=refpoint_embed.transpose(0, 1),
+            level_start_index=level_start_index,
+            spatial_shapes=spatial_shapes,
+            valid_ratios=valid_ratios, tgt_mask=attn_mask,
+            tgt_mask2=attn_mask2,
+            memory_text=text_dict['encoded_text'],
+            text_attention_mask=~text_dict['text_token_mask'],
+            text_dict=text_dict,
+            dn_meta=dn_meta,
+            targets=targets,
+            kpt_embed=kpt_embed
+            # we ~ the mask . False means use the token; True means pad the token
+        )
+        #########################################################
+        # End Decoder
+        # hs: n_dec, bs, nq, d_model
+        # references: n_dec+1, bs, nq, query_dim
+        #########################################################
+        #########################################################
+        # Begin postprocess
+        #########################################################
+        if self.two_stage_type == 'standard':
+            if self.two_stage_keep_all_tokens:
+                hs_enc = output_memory.unsqueeze(0)
+                ref_enc = enc_outputs_coord_unselected.unsqueeze(0)
+                init_box_proposal = output_proposals
+                # import ipdb; ipdb.set_trace()
+            else:
+                hs_enc = tgt_undetach.unsqueeze(0)
+                ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0)
+        else:
+            hs_enc = ref_enc = None
+        #########################################################
+        # End postprocess
+        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None
+        # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None
+        #########################################################
+        return hs, references, hs_enc, ref_enc, init_box_proposal
+        # hs: (n_dec, bs, nq, d_model)
+        # references: sigmoid coordinates. (n_dec+1, bs, bq, 4)
+        # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None
+        # ref_enc: sigmoid coordinates. \
+        #           (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None
+class TransformerEncoder(nn.Module):
+    def __init__(self,
+                 encoder_layer, num_layers, d_model=256,
+                 num_queries=300,
+                 enc_layer_share=False,
+                 text_enhance_layer=None,
+                 feature_fusion_layer=None,
+                 use_checkpoint=False,
+                 use_transformer_ckpt=False,
+                 ):
+        """_summary_
+        Args:
+            encoder_layer (_type_): _description_
+            num_layers (_type_): _description_
+            norm (_type_, optional): _description_. Defaults to None.
+            d_model (int, optional): _description_. Defaults to 256.
+            num_queries (int, optional): _description_. Defaults to 300.
+            enc_layer_share (bool, optional): _description_. Defaults to False.
+        """
+        super().__init__()
+        # prepare layers
+        self.layers = []
+        self.text_layers = []
+        self.fusion_layers = []
+        if num_layers > 0:
+            self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share)
+            if text_enhance_layer is not None:
+                self.text_layers = _get_clones(text_enhance_layer, num_layers, layer_share=enc_layer_share)
+            if feature_fusion_layer is not None:
+                self.fusion_layers = _get_clones(feature_fusion_layer, num_layers, layer_share=enc_layer_share)
+        else:
+            self.layers = []
+            del encoder_layer
+            if text_enhance_layer is not None:
+                self.text_layers = []
+                del text_enhance_layer
+            if feature_fusion_layer is not None:
+                self.fusion_layers = []
+                del feature_fusion_layer
+        self.query_scale = None
+        self.num_queries = num_queries
+        self.num_layers = num_layers
+        self.d_model = d_model
+        self.use_checkpoint = use_checkpoint
+        self.use_transformer_ckpt = use_transformer_ckpt
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        reference_points_list = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
+                                          torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),)
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
+            ref = torch.stack((ref_x, ref_y), -1)
+            reference_points_list.append(ref)
+        reference_points = torch.cat(reference_points_list, 1)
+        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+        return reference_points
+    def forward(self,
+                # for images
+                src: Tensor,
+                pos: Tensor,
+                spatial_shapes: Tensor,
+                level_start_index: Tensor,
+                valid_ratios: Tensor,
+                key_padding_mask: Tensor,
+                # for texts
+                memory_text: Tensor = None,
+                text_attention_mask: Tensor = None,
+                pos_text: Tensor = None,
+                text_self_attention_masks: Tensor = None,
+                position_ids: Tensor = None,
+                ):
+        """
+        Input:
+            - src: [bs, sum(hi*wi), 256]
+            - pos: pos embed for src. [bs, sum(hi*wi), 256]
+            - spatial_shapes: h,w of each level [num_level, 2]
+            - level_start_index: [num_level] start point of level in sum(hi*wi).
+            - valid_ratios: [bs, num_level, 2]
+            - key_padding_mask: [bs, sum(hi*wi)]
+            - memory_text: bs, n_text, 256
+            - text_attention_mask: bs, n_text
+                False for no padding; True for padding
+            - pos_text: bs, n_text, 256
+            - position_ids: bs, n_text
+        Intermedia:
+            - reference_points: [bs, sum(hi*wi), num_level, 2]
+        Outpus:
+            - output: [bs, sum(hi*wi), 256]
+        """
+        output = src
+        # preparation and reshape
+        if self.num_layers > 0:
+            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+        if self.text_layers:
+            # generate pos_text
+            bs, n_text, text_dim = memory_text.shape
+            if pos_text is None and position_ids is None:
+                pos_text = torch.arange(n_text, device=memory_text.device).float().unsqueeze(0).unsqueeze(-1).repeat(bs,
+                                                                                                                     1,
+                                                                                                                     1)
+                pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False)
+            if position_ids is not None:
+                pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False)
+        # main process
+        for layer_id, layer in enumerate(self.layers):
+            # if output.isnan().any() or memory_text.isnan().any():
+            #     if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+            #         import ipdb; ipdb.set_trace()
+            if self.fusion_layers:
+                if self.use_checkpoint:
+                    output, memory_text = checkpoint.checkpoint(
+                        self.fusion_layers[layer_id],
+                        output,
+                        memory_text,
+                        key_padding_mask,
+                        text_attention_mask
+                    )
+                else:
+                    output, memory_text = self.fusion_layers[layer_id](v=output, l=memory_text,
+                                                                       attention_mask_v=key_padding_mask,
+                                                                       attention_mask_l=text_attention_mask)
+            if self.text_layers:
+                memory_text = self.text_layers[layer_id](
+                    src=memory_text.transpose(0, 1),
+                    src_mask=~text_self_attention_masks,  # note we use ~ for mask here
+                    src_key_padding_mask=text_attention_mask,
+                    pos=(pos_text.transpose(0, 1) if pos_text is not None else None)
+                ).transpose(0, 1)
+            # main process
+            if self.use_transformer_ckpt:
+                output = checkpoint.checkpoint(
+                    layer,
+                    output,
+                    pos,
+                    reference_points,
+                    spatial_shapes,
+                    level_start_index,
+                    key_padding_mask
+                )
+            else:
+                output = layer(src=output, pos=pos, reference_points=reference_points, spatial_shapes=spatial_shapes,
+                               level_start_index=level_start_index, key_padding_mask=key_padding_mask)
+        return output, memory_text
+class TransformerDecoder(nn.Module):
+    def __init__(self, decoder_layer, num_layers, norm=None,
+                 return_intermediate=False,
+                 d_model=256, query_dim=4,
+                 modulate_hw_attn=False,
+                 num_feature_levels=1,
+                 deformable_decoder=False,
+                 decoder_query_perturber=None,
+                 dec_layer_number=None,  # number of queries each layer in decoder
+                 rm_dec_query_scale=False,
+                 dec_layer_share=False,
+                 dec_layer_dropout_prob=None,
+                 use_detached_boxes_dec_out=False,
+                 num_box_decoder_layers=2,
+                 num_body_points=68,
+                 ):
+        super().__init__()
+        if num_layers > 0:
+            self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share)
+        else:
+            self.layers = []
+        self.num_layers = num_layers
+        self.norm = norm
+        self.return_intermediate = return_intermediate
+        assert return_intermediate, "support return_intermediate only"
+        self.query_dim = query_dim
+        assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim)
+        self.num_feature_levels = num_feature_levels
+        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
+        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)
+        if not deformable_decoder:
+            self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2)
+        else:
+            self.query_pos_sine_scale = None
+        if rm_dec_query_scale:
+            self.query_scale = None
+        else:
+            raise NotImplementedError
+            self.query_scale = MLP(d_model, d_model, d_model, 2)
+        self.bbox_embed = None
+        self.class_embed = None
+        self.pose_embed = None
+        self.pose_hw_embed = None
+        self.d_model = d_model
+        self.modulate_hw_attn = modulate_hw_attn
+        self.deformable_decoder = deformable_decoder
+        if not deformable_decoder and modulate_hw_attn:
+            self.ref_anchor_head = MLP(d_model, d_model, 2, 2)
+        else:
+            self.ref_anchor_head = None
+        self.decoder_query_perturber = decoder_query_perturber
+        self.box_pred_damping = None
+        self.dec_layer_number = dec_layer_number
+        if dec_layer_number is not None:
+            assert isinstance(dec_layer_number, list)
+            assert len(dec_layer_number) == num_layers
+            # assert dec_layer_number[0] ==
+        self.dec_layer_dropout_prob = dec_layer_dropout_prob
+        if dec_layer_dropout_prob is not None:
+            assert isinstance(dec_layer_dropout_prob, list)
+            assert len(dec_layer_dropout_prob) == num_layers
+            for i in dec_layer_dropout_prob:
+                assert 0.0 <= i <= 1.0
+        self.rm_detach = None
+        self.num_body_points = num_body_points
+        self.hw = nn.Embedding(17, 2)
+        self.num_box_decoder_layers = num_box_decoder_layers
+        self.kpt_index = [x for x in range(50 * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
+        self.hw_append = nn.Embedding(self.num_body_points-17, 2)
+    def forward(self, tgt, memory,
+                tgt_mask: Optional[Tensor] = None,
+                tgt_mask2: Optional[Tensor] = None,
+                memory_mask: Optional[Tensor] = None,
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                memory_key_padding_mask: Optional[Tensor] = None,
+                pos: Optional[Tensor] = None,
+                refpoints_unsigmoid: Optional[Tensor] = None,  # num_queries, bs, 2
+                # for memory
+                level_start_index: Optional[Tensor] = None,  # num_levels
+                spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
+                valid_ratios: Optional[Tensor] = None,
+                # for text
+                memory_text: Optional[Tensor] = None,
+                text_attention_mask: Optional[Tensor] = None,
+                text_dict: Optional[Tensor] = None,
+                dn_meta: Optional[Tensor] = None,
+                targets: Optional[Tensor] = None,
+                kpt_embed: Optional[Tensor] = None
+                ):
+        """
+        Input:
+            - tgt: nq, bs, d_model
+            - memory: hw, bs, d_model
+            - pos: hw, bs, d_model
+            - refpoints_unsigmoid: nq, bs, 2/4
+            - valid_ratios/spatial_shapes: bs, nlevel, 2
+        """
+        output = tgt
+        output += self.hw.weight[0, 0] * 0.0
+        intermediate = []
+        reference_points = refpoints_unsigmoid.sigmoid()
+        ref_points = [reference_points]
+        effect_num_dn = dn_meta['pad_size'] if self.training else 0
+        inter_select_number = 50
+        for layer_id, layer in enumerate(self.layers):
+            if reference_points.shape[-1] == 4:
+                reference_points_input = reference_points[:, :, None] \
+                                         * torch.cat([valid_ratios, valid_ratios], -1)[None, :]  # nq, bs, nlevel, 4
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
+            query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :])  # nq, bs, 256*2
+            # conditional query
+            raw_query_pos = self.ref_point_head(query_sine_embed)  # nq, bs, 256
+            pos_scale = self.query_scale(output) if self.query_scale is not None else 1
+            query_pos = pos_scale * raw_query_pos
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if query_pos.isnan().any() | query_pos.isinf().any():
+            #         import ipdb; ipdb.set_trace()
+            # main process
+            output = layer(
+                tgt=output,
+                tgt_query_pos=query_pos,
+                tgt_query_sine_embed=query_sine_embed,
+                tgt_key_padding_mask=tgt_key_padding_mask,
+                tgt_reference_points=reference_points_input,
+                memory_text=memory_text,
+                text_attention_mask=text_attention_mask,
+                memory=memory,
+                memory_key_padding_mask=memory_key_padding_mask,
+                memory_level_start_index=level_start_index,
+                memory_spatial_shapes=spatial_shapes,
+                memory_pos=pos,
+                self_attn_mask=tgt_mask,
+                cross_attn_mask=memory_mask
+            )
+            if output.isnan().any() | output.isinf().any():
+                print(f"output layer_id {layer_id} is nan")
+                try:
+                    num_nan = output.isnan().sum().item()
+                    num_inf = output.isinf().sum().item()
+                    print(f"num_nan {num_nan}, num_inf {num_inf}")
+                except Exception as e:
+                    print(e)
+            intermediate.append(self.norm(output))
+            # iter update
+            if layer_id < self.num_box_decoder_layers:
+                reference_before_sigmoid = inverse_sigmoid(reference_points)
+                delta_unsig = self.bbox_embed[layer_id](output)
+                outputs_unsig = delta_unsig + reference_before_sigmoid
+                new_reference_points = outputs_unsig.sigmoid()
+            # select # ref points as anchors
+            if layer_id == self.num_box_decoder_layers - 1:
+                dn_output = output[:effect_num_dn]
+                dn_new_reference_points = new_reference_points[:effect_num_dn]
+                class_unselected = self.class_embed[layer_id](output.transpose(0, 1), text_dict)[:,
+                                   effect_num_dn:].transpose(0, 1)
+                topk_proposals = torch.topk(class_unselected.max(-1)[0], inter_select_number, dim=0)[1]
+                new_reference_points_for_box = torch.gather(new_reference_points[effect_num_dn:], 0,
+                                                            topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+                new_output_for_box = torch.gather(output[effect_num_dn:], 0,
+                                                  topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
+                keypoint_embed=kpt_embed.transpose(0, 1)
+                new_output_for_keypoint = keypoint_embed[None, :, :, :].repeat(new_output_for_box.shape[0],1,1,1)
+                delta_xy = self.pose_embed[-1](new_output_for_keypoint)[..., :2]
+                keypoint_xy = (inverse_sigmoid(new_reference_points_for_box[..., :2][:, None]) + delta_xy).sigmoid()
+                num_queries, _, bs, _ = keypoint_xy.shape
+                aa = torch.cat((self.hw.weight,self.hw_append.weight),dim=0)
+                keypoint_wh_weight = aa.unsqueeze(0).unsqueeze(-2).repeat(num_queries, 1, bs, 1).sigmoid()
+                keypoint_wh = keypoint_wh_weight * new_reference_points_for_box[..., 2:][:, None]
+                new_reference_points_for_keypoint = torch.cat((keypoint_xy, keypoint_wh), dim=-1)
+                new_reference_points = torch.cat(
+                    (new_reference_points_for_box.unsqueeze(1), new_reference_points_for_keypoint), dim=1).flatten(0, 1)
+                output = torch.cat((new_output_for_box.unsqueeze(1), new_output_for_keypoint), dim=1).flatten(0, 1)
+                new_reference_points = torch.cat((dn_new_reference_points, new_reference_points), dim=0)
+                output = torch.cat((dn_output, output), dim=0)
+                tgt_mask = tgt_mask2
+            if layer_id >= self.num_box_decoder_layers:
+                reference_before_sigmoid = inverse_sigmoid(reference_points)
+                output_bbox_dn = output[:effect_num_dn]
+                output_bbox_norm = output[effect_num_dn:][0::(self.num_body_points + 1)]
+                reference_before_sigmoid_bbox_dn = reference_before_sigmoid[:effect_num_dn]
+                reference_before_sigmoid_bbox_norm = reference_before_sigmoid[effect_num_dn:][
+                                                     0::(self.num_body_points + 1)]
+                delta_unsig_dn = self.bbox_embed[layer_id](output_bbox_dn)
+                delta_unsig_norm = self.bbox_embed[layer_id](output_bbox_norm)
+                outputs_unsig_dn = delta_unsig_dn + reference_before_sigmoid_bbox_dn
+                outputs_unsig_norm = delta_unsig_norm + reference_before_sigmoid_bbox_norm
+                new_reference_points_for_box_dn = outputs_unsig_dn.sigmoid()
+                new_reference_points_for_box_norm = outputs_unsig_norm.sigmoid()
+                output_kpt = output[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index, device=output.device))
+                delta_xy_unsig = self.pose_embed[layer_id - self.num_box_decoder_layers](output_kpt)
+                outputs_unsig = reference_before_sigmoid[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index,
+                                                                                                      device=output.device)).clone()  ##
+                delta_hw_unsig = self.pose_hw_embed[layer_id - self.num_box_decoder_layers](output_kpt)
+                outputs_unsig[..., :2] += delta_xy_unsig[..., :2]
+                outputs_unsig[..., 2:] += delta_hw_unsig
+                new_reference_points_for_keypoint = outputs_unsig.sigmoid()
+                bs = new_reference_points_for_box_norm.shape[1]
+                new_reference_points_norm = torch.cat((new_reference_points_for_box_norm.unsqueeze(1),
+                                                       new_reference_points_for_keypoint.view(-1, self.num_body_points,
+                                                                                              bs, 4)), dim=1).flatten(0,
+                                                                                                                      1)
+                new_reference_points = torch.cat((new_reference_points_for_box_dn, new_reference_points_norm), dim=0)
+            if self.rm_detach and 'dec' in self.rm_detach:
+                reference_points = new_reference_points
+            else:
+                reference_points = new_reference_points.detach()
+            # if layer_id != self.num_layers - 1:
+            if self.use_detached_boxes_dec_out:
+                ref_points.append(reference_points)
+            else:
+                ref_points.append(new_reference_points)
+        return [
+            [itm_out.transpose(0, 1) for itm_out in intermediate],
+            [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points]
+        ]
+class DeformableTransformerEncoderLayer(nn.Module):
+    def __init__(self,
+                 d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 add_channel_attention=False,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 ):
+        super().__init__()
+        # self attention
+        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+        # channel attention
+        self.add_channel_attention = add_channel_attention
+        if add_channel_attention:
+            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
+            self.norm_channel = nn.LayerNorm(d_model)
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+    def forward_ffn(self, src):
+        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+        src = src + self.dropout3(src2)
+        src = self.norm2(src)
+        return src
+    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
+        # self attention
+        # import ipdb; ipdb.set_trace()
+        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index,
+                              key_padding_mask)
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        # ffn
+        src = self.forward_ffn(src)
+        # channel attn
+        if self.add_channel_attention:
+            src = self.norm_channel(src + self.activ_channel(src))
+        return src
+class DeformableTransformerDecoderLayer(nn.Module):
+    def __init__(self, d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 use_text_feat_guide=False,
+                 use_text_cross_attention=False,
+                 ffn_extra_layernorm=False
+                 ):
+        super().__init__()
+        # cross attention
+        # self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm1 = nn.LayerNorm(d_model)
+        # cross attention text
+        if use_text_cross_attention:
+            self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+            self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+            self.catext_norm = nn.LayerNorm(d_model)
+        # self attention
+        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+        self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm2 = nn.LayerNorm(d_model)
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
+        self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
+        self.norm3 = nn.LayerNorm(d_model)
+        if ffn_extra_layernorm:
+            raise NotImplementedError('ffn_extra_layernorm not implemented')
+            self.norm_ext = nn.LayerNorm(d_ffn)
+        else:
+            self.norm_ext = None
+        self.key_aware_proj = None
+        self.use_text_feat_guide = use_text_feat_guide
+        assert not use_text_feat_guide
+        self.use_text_cross_attention = use_text_cross_attention
+    def rm_self_attn_modules(self):
+        self.self_attn = None
+        self.dropout2 = None
+        self.norm2 = None
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+    def forward_ffn(self, tgt, ipdb_flag=False):
+        with torch.cuda.amp.autocast(enabled=False):
+            tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+    def forward(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4
+                memory_text: Optional[Tensor] = None,  # bs, num_token, d_model
+                text_attention_mask: Optional[Tensor] = None,  # bs, num_token
+                # for memory
+                memory: Optional[Tensor] = None,  # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None,  # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None,  # pos for memory
+                # sa
+                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
+                ):
+        """
+        Input:
+            - tgt/tgt_query_pos: nq, bs, d_model
+            -
+        """
+        assert cross_attn_mask is None
+        # self attention
+        if self.self_attn is not None:
+            # import ipdb; ipdb.set_trace()
+            q = k = self.with_pos_embed(tgt, tgt_query_pos)
+            tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
+            tgt = tgt + self.dropout2(tgt2)
+            tgt = self.norm2(tgt)
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if tgt.isnan().any() | tgt.isinf().any() :
+            #         import ipdb; ipdb.set_trace()
+        if self.use_text_cross_attention:
+            tgt2 = self.ca_text(self.with_pos_embed(tgt, tgt_query_pos), memory_text.transpose(0, 1),
+                                memory_text.transpose(0, 1), key_padding_mask=text_attention_mask)[0]
+            tgt = tgt + self.catext_dropout(tgt2)
+            tgt = self.catext_norm(tgt)
+            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            #     if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+            #         import ipdb; ipdb.set_trace()
+            # if tgt.isnan().any() | tgt.isinf().any() :
+            #     import ipdb; ipdb.set_trace()
+        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                               tgt_reference_points.transpose(0, 1).contiguous(),
+                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index,
+                               memory_key_padding_mask).transpose(0, 1)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     tgtk = tgt.clone()
+        #     if tgt.isnan().any() | tgt.isinf().any() :
+        #         import ipdb; ipdb.set_trace()
+        # ffn
+        tgt = self.forward_ffn(tgt)
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     if tgt.isnan().any() | tgt.isinf().any() :
+        #         tgtk = self.forward_ffn(tgtk, ipdb_flag=True)
+        #         import ipdb; ipdb.set_trace()
+        return tgt
+def _get_clones(module, N, layer_share=False):
+    # import ipdb; ipdb.set_trace()
+    if layer_share:
+        return nn.ModuleList([module for i in range(N)])
+    else:
+        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+def build_deformable_transformer(args):
+    decoder_query_perturber = None
+    if args.decoder_layer_noise:
+        from .utils import RandomBoxPerturber
+        decoder_query_perturber = RandomBoxPerturber(
+            x_noise_scale=args.dln_xy_noise, y_noise_scale=args.dln_xy_noise,
+            w_noise_scale=args.dln_hw_noise, h_noise_scale=args.dln_hw_noise)
+    use_detached_boxes_dec_out = False
+    try:
+        use_detached_boxes_dec_out = args.use_detached_boxes_dec_out
+    except:
+        use_detached_boxes_dec_out = False
+    binary_query_selection = False
+    try:
+        binary_query_selection = args.binary_query_selection
+    except:
+        binary_query_selection = False
+    ffn_extra_layernorm = False
+    try:
+        ffn_extra_layernorm = args.ffn_extra_layernorm
+    except:
+        print('ffn_extra_layernorm not found, set to False')
+        ffn_extra_layernorm = False
+    return DeformableTransformer(
+        d_model=args.hidden_dim,
+        dropout=args.dropout,
+        nhead=args.nheads,
+        num_queries=args.num_queries,
+        dim_feedforward=args.dim_feedforward,
+        num_encoder_layers=args.enc_layers,
+        num_unicoder_layers=args.unic_layers,
+        num_decoder_layers=args.dec_layers,
+        normalize_before=args.pre_norm,
+        return_intermediate_dec=True,
+        query_dim=args.query_dim,
+        activation=args.transformer_activation,
+        num_patterns=args.num_patterns,
+        modulate_hw_attn=True,
+        deformable_encoder=True,
+        deformable_decoder=True,
+        num_feature_levels=args.num_feature_levels,
+        enc_n_points=args.enc_n_points,
+        dec_n_points=args.dec_n_points,
+        use_deformable_box_attn=args.use_deformable_box_attn,
+        box_attn_type=args.box_attn_type,
+        learnable_tgt_init=True,
+        decoder_query_perturber=decoder_query_perturber,
+        add_channel_attention=args.add_channel_attention,
+        add_pos_value=args.add_pos_value,
+        random_refpoints_xy=args.random_refpoints_xy,
+        # two stage
+        two_stage_type=args.two_stage_type,  # ['no', 'standard', 'early']
+        two_stage_pat_embed=args.two_stage_pat_embed,
+        two_stage_add_query_num=args.two_stage_add_query_num,
+        two_stage_learn_wh=args.two_stage_learn_wh,
+        two_stage_keep_all_tokens=args.two_stage_keep_all_tokens,
+        dec_layer_number=args.dec_layer_number,
+        rm_self_attn_layers=None,
+        key_aware_type=None,
+        layer_share_type=None,
+        rm_detach=None,
+        decoder_sa_type=args.decoder_sa_type,
+        module_seq=args.decoder_module_seq,
+        embed_init_tgt=args.embed_init_tgt,
+        use_detached_boxes_dec_out=use_detached_boxes_dec_out,
+        use_text_enhancer=args.use_text_enhancer,
+        use_fusion_layer=args.use_fusion_layer,
+        use_checkpoint=args.use_checkpoint,
+        use_transformer_ckpt=args.use_transformer_ckpt,
+        use_text_cross_attention=args.use_text_cross_attention,
+        text_dropout=args.text_dropout,
+        fusion_dropout=args.fusion_dropout,
+        fusion_droppath=args.fusion_droppath,
+        binary_query_selection=binary_query_selection,
+        ffn_extra_layernorm=ffn_extra_layernorm,
+    )

python/utils/dependencies/XPose/models/UniPose/fuse_modules.py ADDED Viewed

	@@ -0,0 +1,274 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# from timm.models.layers import DropPath
+from src.modules.util import DropPath
+class FeatureResizer(nn.Module):
+    """
+    This class takes as input a set of embeddings of dimension C1 and outputs a set of
+    embedding of dimension C2, after a linear transformation, dropout and normalization (LN).
+    """
+    def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
+        super().__init__()
+        self.do_ln = do_ln
+        # Object feature encoding
+        self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True)
+        self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, encoder_features):
+        x = self.fc(encoder_features)
+        if self.do_ln:
+            x = self.layer_norm(x)
+        output = self.dropout(x)
+        return output
+def l1norm(X, dim, eps=1e-8):
+    """L1-normalize columns of X
+    """
+    norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps
+    X = torch.div(X, norm)
+    return X
+def l2norm(X, dim, eps=1e-8):
+    """L2-normalize columns of X
+    """
+    norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps
+    X = torch.div(X, norm)
+    return X
+def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8):
+    """
+    query: (n_context, queryL, d)
+    context: (n_context, sourceL, d)
+    """
+    batch_size_q, queryL = query.size(0), query.size(1)
+    batch_size, sourceL = context.size(0), context.size(1)
+    # Get attention
+    # --> (batch, d, queryL)
+    queryT = torch.transpose(query, 1, 2)
+    # (batch, sourceL, d)(batch, d, queryL)
+    # --> (batch, sourceL, queryL)
+    attn = torch.bmm(context, queryT)
+    if raw_feature_norm == "softmax":
+        # --> (batch*sourceL, queryL)
+        attn = attn.view(batch_size * sourceL, queryL)
+        attn = nn.Softmax()(attn)
+        # --> (batch, sourceL, queryL)
+        attn = attn.view(batch_size, sourceL, queryL)
+    elif raw_feature_norm == "l2norm":
+        attn = l2norm(attn, 2)
+    elif raw_feature_norm == "clipped_l2norm":
+        attn = nn.LeakyReLU(0.1)(attn)
+        attn = l2norm(attn, 2)
+    else:
+        raise ValueError("unknown first norm type:", raw_feature_norm)
+    # --> (batch, queryL, sourceL)
+    attn = torch.transpose(attn, 1, 2).contiguous()
+    # --> (batch*queryL, sourceL)
+    attn = attn.view(batch_size * queryL, sourceL)
+    attn = nn.Softmax()(attn * smooth)
+    # --> (batch, queryL, sourceL)
+    attn = attn.view(batch_size, queryL, sourceL)
+    # --> (batch, sourceL, queryL)
+    attnT = torch.transpose(attn, 1, 2).contiguous()
+    # --> (batch, d, sourceL)
+    contextT = torch.transpose(context, 1, 2)
+    # (batch x d x sourceL)(batch x sourceL x queryL)
+    # --> (batch, d, queryL)
+    weightedContext = torch.bmm(contextT, attnT)
+    # --> (batch, queryL, d)
+    weightedContext = torch.transpose(weightedContext, 1, 2)
+    return weightedContext, attnT
+class BiMultiHeadAttention(nn.Module):
+    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None):
+        super(BiMultiHeadAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.v_dim = v_dim
+        self.l_dim = l_dim
+        assert (
+                self.head_dim * self.num_heads == self.embed_dim
+        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+        self.scale = self.head_dim ** (-0.5)
+        self.dropout = dropout
+        self.v_proj = nn.Linear(self.v_dim, self.embed_dim)
+        self.l_proj = nn.Linear(self.l_dim, self.embed_dim)
+        self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim)
+        self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)
+        self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)
+        self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)
+        self.stable_softmax_2d = True
+        self.clamp_min_for_underflow = True
+        self.clamp_max_for_overflow = True
+        self._reset_parameters()
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def _reset_parameters(self):
+        nn.init.xavier_uniform_(self.v_proj.weight)
+        self.v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.l_proj.weight)
+        self.l_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.values_v_proj.weight)
+        self.values_v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.values_l_proj.weight)
+        self.values_l_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.out_v_proj.weight)
+        self.out_v_proj.bias.data.fill_(0)
+        nn.init.xavier_uniform_(self.out_l_proj.weight)
+        self.out_l_proj.bias.data.fill_(0)
+    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
+        """Run bidirectional cross-attention between vision and language features.
+        A single similarity matrix is built from the projected `v` (queries) and projected `l` (keys).
+        It is soft-maxed over the language axis to update `v`, and its transpose is soft-maxed over
+        the vision axis to update `l`.
+        Args:
+            v: vision features, (bs, n_img, v_dim).
+            l: language features, (bs, n_text, l_dim).
+            attention_mask_v: optional (bs, n_img) mask; positions where it is True are filled with -inf
+                before the vision-axis softmax (used when updating `l`). `masked_fill_` expects a boolean tensor.
+            attention_mask_l: optional (bs, n_text) mask; positions where it is True are filled with -inf
+                before the language-axis softmax (used when updating `v`).
+        Returns:
+            Tuple `(attn_output_v, attn_output_l)` with shapes (bs, n_img, v_dim) and (bs, n_text, l_dim).
+        Raises:
+            ValueError: if the attention weights or either attention output has an unexpected shape.
+        """
+        bsz, tgt_len, _ = v.size()
+        # Queries come from vision and are pre-scaled by head_dim ** -0.5; keys come from language.
+        query_states = self.v_proj(v) * self.scale
+        key_states = self._shape(self.l_proj(l), -1, bsz)
+        value_v_states = self._shape(self.values_v_proj(v), -1, bsz)
+        value_l_states = self._shape(self.values_l_proj(l), -1, bsz)
+        # Fold the head axis into the batch axis so torch.bmm can be used.
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_v_states = value_v_states.view(*proj_shape)
+        value_l_states = value_l_states.view(*proj_shape)
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )
+        # Subtract the global maximum (over the whole tensor, not per row) for numerical stability.
+        if self.stable_softmax_2d:
+            attn_weights = attn_weights - attn_weights.max()
+        if self.clamp_min_for_underflow:
+            attn_weights = torch.clamp(attn_weights, min=-50000) # Do not increase -50000, data type half has quite limited range
+        if self.clamp_max_for_overflow:
+            attn_weights = torch.clamp(attn_weights, max=50000) # Do not increase 50000, data type half has quite limited range
+        # Language-side logits: transpose to (bs*nhead, ntxt, nimg) and subtract the per-row maximum.
+        attn_weights_T = attn_weights.transpose(1, 2)
+        attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[
+            0])
+        if self.clamp_min_for_underflow:
+            attn_weights_l = torch.clamp(attn_weights_l, min=-50000) # Do not increase -50000, data type half has quite limited range
+        if self.clamp_max_for_overflow:
+            attn_weights_l = torch.clamp(attn_weights_l, max=50000) # Do not increase 50000, data type half has quite limited range
+        # mask vision for language
+        if attention_mask_v is not None:
+            # (bs, n_img) -> (bs*nhead, 1, n_img) so the mask broadcasts over every text row.
+            attention_mask_v = attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights_l.masked_fill_(attention_mask_v, float('-inf'))
+        attn_weights_l = attn_weights_l.softmax(dim=-1)
+        # mask language for vision
+        if attention_mask_l is not None:
+            # (bs, n_text) -> (bs*nhead, 1, n_text) so the mask broadcasts over every image row.
+            attention_mask_l = attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights.masked_fill_(attention_mask_l, float('-inf'))
+        attn_weights_v = attn_weights.softmax(dim=-1)
+        attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training)
+        attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training)
+        # Vision is updated from language values and language from vision values.
+        attn_output_v = torch.bmm(attn_probs_v, value_l_states)
+        attn_output_l = torch.bmm(attn_probs_l, value_v_states)
+        if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}"
+            )
+        if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}"
+            )
+        # Unfold the heads and merge them back into the embedding dimension.
+        attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output_v = attn_output_v.transpose(1, 2)
+        attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)
+        attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim)
+        attn_output_l = attn_output_l.transpose(1, 2)
+        attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)
+        # Project each stream back to its own input width (v_dim / l_dim).
+        attn_output_v = self.out_v_proj(attn_output_v)
+        attn_output_l = self.out_l_proj(attn_output_l)
+        return attn_output_v, attn_output_l
+# Bi-Direction MHA (text->image, image->text)
+class BiAttentionBlock(nn.Module):
+    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1,
+                 drop_path=.0, init_values=1e-4, cfg=None):
+        """
+        Inputs:
+            embed_dim - Dimensionality of input and attention feature vectors
+            hidden_dim - Dimensionality of hidden layer in feed-forward network
+                         (usually 2-4x larger than embed_dim)
+            num_heads - Number of heads to use in the Multi-Head Attention block
+            dropout - Amount of dropout to apply in the feed-forward network
+        """
+        super(BiAttentionBlock, self).__init__()
+        # pre layer norm
+        self.layer_norm_v = nn.LayerNorm(v_dim)
+        self.layer_norm_l = nn.LayerNorm(l_dim)
+        self.attn = BiMultiHeadAttention(v_dim=v_dim,
+                                         l_dim=l_dim,
+                                         embed_dim=embed_dim,
+                                         num_heads=num_heads,
+                                         dropout=dropout)
+        # add layer scale for training stability
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=False)
+        self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=False)
+    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
+        v = self.layer_norm_v(v)
+        l = self.layer_norm_l(l)
+        delta_v, delta_l = self.attn(v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l)
+        # v, l = v + delta_v, l + delta_l
+        v = v + self.drop_path(self.gamma_v * delta_v)
+        l = l + self.drop_path(self.gamma_l * delta_l)
+        return v, l

python/utils/dependencies/XPose/models/UniPose/mask_generate.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import torch
+def prepare_for_mask(kpt_mask):
+    tgt_size2 = 50 * 69
+    attn_mask2 = torch.ones(kpt_mask.shape[0], 8, tgt_size2, tgt_size2).to('cuda') < 0
+    group_bbox_kpt = 69
+    num_group=50
+    for matchj in range(num_group * group_bbox_kpt):
+        sj = (matchj // group_bbox_kpt) * group_bbox_kpt
+        ej = (matchj // group_bbox_kpt + 1)*group_bbox_kpt
+        if sj > 0:
+            attn_mask2[:,:,matchj, :sj] = True
+        if ej < num_group * group_bbox_kpt:
+            attn_mask2[:,:,matchj, ej:] = True
+    bs, length = kpt_mask.shape
+    equal_mask = kpt_mask[:, :, None] == kpt_mask[:, None, :]
+    equal_mask= equal_mask.unsqueeze(1).repeat(1,8,1,1)
+    for idx in range(num_group):
+        start_idx = idx * length
+        end_idx = (idx + 1) * length
+        attn_mask2[:, :,start_idx:end_idx, start_idx:end_idx][equal_mask] = False
+        attn_mask2[:, :,start_idx:end_idx, start_idx:end_idx][~equal_mask] = True
+    input_query_label = None
+    input_query_bbox = None
+    attn_mask = None
+    dn_meta = None
+    return input_query_label, input_query_bbox, attn_mask, attn_mask2.flatten(0,1), dn_meta
+def post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
+    if dn_meta and dn_meta['pad_size'] > 0:
+        output_known_class = [outputs_class_i[:, :dn_meta['pad_size'], :] for outputs_class_i in outputs_class]
+        output_known_coord = [outputs_coord_i[:, :dn_meta['pad_size'], :] for outputs_coord_i in outputs_coord]
+        outputs_class = [outputs_class_i[:, dn_meta['pad_size']:, :] for outputs_class_i in outputs_class]
+        outputs_coord = [outputs_coord_i[:, dn_meta['pad_size']:, :] for outputs_coord_i in outputs_coord]
+        out = {'pred_logits': output_known_class[-1], 'pred_boxes': output_known_coord[-1]}
+        if aux_loss:
+            out['aux_outputs'] = _set_aux_loss(output_known_class, output_known_coord)
+        dn_meta['output_known_lbs_bboxes'] = out
+    return outputs_class, outputs_coord

python/utils/dependencies/XPose/models/UniPose/ops/functions/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from .ms_deform_attn_func import MSDeformAttnFunction

python/utils/dependencies/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+import MultiScaleDeformableAttention as MSDA
+class MSDeformAttnFunction(Function):
+    """Autograd wrapper around the compiled `MultiScaleDeformableAttention` (MSDA) forward/backward ops."""
+    @staticmethod
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
+        """Call `MSDA.ms_deform_attn_forward` and save the tensor inputs for `backward`."""
+        # im2col_step is kept on ctx as a plain attribute rather than through save_for_backward.
+        ctx.im2col_step = im2col_step
+        output = MSDA.ms_deform_attn_forward(
+            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
+        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
+        return output
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        """Return gradients for `value`, `sampling_locations` and `attention_weights`; other inputs get None."""
+        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
+        grad_value, grad_sampling_loc, grad_attn_weight = \
+            MSDA.ms_deform_attn_backward(
+                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
+        # One entry per forward argument, in the same order; None marks inputs without a gradient.
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
+    # for debug and test only,
+    # need to use cuda version instead
+    N_, S_, M_, D_ = value.shape
+    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
+    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
+    sampling_grids = 2 * sampling_locations - 1
+    sampling_value_list = []
+    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
+                                          mode='bilinear', padding_mode='zeros', align_corners=False)
+        sampling_value_list.append(sampling_value_l_)
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
+    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
+    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
+    return output.transpose(1, 2).contiguous()

python/utils/dependencies/XPose/models/UniPose/ops/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from .ms_deform_attn import MSDeformAttn

python/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn.py ADDED Viewed

	@@ -0,0 +1,142 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import warnings
+import math, os
+import sys
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+from src.utils.dependencies.XPose.models.UniPose.ops.functions.ms_deform_attn_func import MSDeformAttnFunction
+def _is_power_of_2(n):
+    if (not isinstance(n, int)) or (n < 0):
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return (n & (n-1) == 0) and n != 0
+class MSDeformAttn(nn.Module):
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
+        """
+        Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+        self.im2col_step = 64
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+        self.use_4D_normalizer = use_4D_normalizer
+        self._reset_parameters()
+    def _reset_parameters(self):
+        constant_(self.sampling_offsets.weight.data, 0.)
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        """
+        :param query                       (N, Length_{query}, C)
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
+        :return output                     (N, Length_{query}, C)
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+        # N, Len_q, n_heads, n_levels, n_points, 2
+        # if os.environ.get('IPDB_DEBUG_SHILONG', False) == 'INFO':
+        #     import ipdb; ipdb.set_trace()
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            if self.use_4D_normalizer:
+                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
+            else:
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+        # if os.environ.get('IPDB_DEBUG_SHILONG', False) == 'INFO':
+        #     import ipdb; ipdb.set_trace()
+        # for amp
+        if value.dtype == torch.float16:
+            # for mixed precision
+            output = MSDeformAttnFunction.apply(
+            value.to(torch.float32), input_spatial_shapes, input_level_start_index, sampling_locations.to(torch.float32), attention_weights, self.im2col_step)
+            output = output.to(torch.float16)
+            output = self.output_proj(output)
+            return output
+        output = MSDeformAttnFunction.apply(
+            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        output = self.output_proj(output)
+        return output

python/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py ADDED Viewed

	@@ -0,0 +1,130 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import warnings
+import math, os
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+try:
+    from src.utils.dependencies.XPose.models.UniPose.ops.functions import MSDeformAttnFunction
+except:
+    warnings.warn('Failed to import MSDeformAttnFunction.')
+def _is_power_of_2(n):
+    if (not isinstance(n, int)) or (n < 0):
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return (n & (n-1) == 0) and n != 0
+class MSDeformAttn(nn.Module):
+    """Multi-scale deformable attention layer whose `forward` additionally accepts a `key` argument."""
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
+        """
+        Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        :param use_4D_normalizer  for 4-d reference boxes, divide offsets by the per-level (W, H)
+                                  instead of by n_points
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+        self.im2col_step = 64
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+        self.use_4D_normalizer = use_4D_normalizer
+        self._reset_parameters()
+    def _reset_parameters(self):
+        """Initialise the offset bias to a per-head directional pattern and the other layers to zeros/Xavier."""
+        constant_(self.sampling_offsets.weight.data, 0.)
+        # One angle per head, evenly spaced around the circle.
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        # Normalise each direction by its largest absolute component, then copy it to every level and point.
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        # The i-th sampling point starts (i + 1) steps away along its head's direction.
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+    def forward(self, query, key, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        r"""
+        :param query                       (N, Length_{query}, C)
+        :param key                          (N, 1, C); accepted but not referenced in this method body
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
+        :return output                     (N, Length_{query}, C)
+        :raises ValueError                 if the last dim of reference_points is neither 2 nor 4
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            # Zero the projected values at padded positions.
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        # Softmax runs jointly over all levels and points of a head, then the two axes are split again.
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+        # sampling_locations: N, Len_q, n_heads, n_levels, n_points, 2
+        if reference_points.shape[-1] == 2:
+            # Shapes are stored as (H, W); offsets are (x, y), so the normaliser is (W, H).
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            # Offsets are scaled by half of the reference box size (last two components).
+            if self.use_4D_normalizer:
+                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
+            else:
+                sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                    + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+        # The compiled op does the sampling and weighting; the result is projected back to d_model.
+        output = MSDeformAttnFunction.apply(
+            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        output = self.output_proj(output)
+        return output

python/utils/dependencies/XPose/models/UniPose/ops/setup.py ADDED Viewed

	@@ -0,0 +1,73 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+import os
+import glob
+import torch
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+from setuptools import find_packages
+from setuptools import setup
+requirements = ["torch", "torchvision"]
+def get_extensions():
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "src")
+    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
+    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
+    sources = main_file + source_cpu
+    extension = CppExtension
+    extra_compile_args = {"cxx": []}
+    define_macros = []
+    # import ipdb; ipdb.set_trace()
+    if torch.cuda.is_available() and CUDA_HOME is not None:
+        extension = CUDAExtension
+        sources += source_cuda
+        define_macros += [("WITH_CUDA", None)]
+        extra_compile_args["nvcc"] = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+    else:
+        raise NotImplementedError('Cuda is not availabel')
+    sources = [os.path.join(extensions_dir, s) for s in sources]
+    include_dirs = [extensions_dir]
+    ext_modules = [
+        extension(
+            "MultiScaleDeformableAttention",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+    return ext_modules
+# Package definition. get_extensions() is evaluated while this call's arguments are built, so running
+# this script raises NotImplementedError when no CUDA toolchain is available.
+setup(
+    name="MultiScaleDeformableAttention",
+    version="1.0",
+    author="Weijie Su",
+    url="https://github.com/fundamentalvision/Deformable-DETR",
+    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
+    packages=find_packages(exclude=("configs", "tests",)),
+    ext_modules=get_extensions(),
+    # Use PyTorch's build_ext command to compile the extension sources.
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)

python/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp ADDED Viewed

	@@ -0,0 +1,41 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include <vector>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+// CPU forward entry point. No CPU implementation is provided: the call always
+// raises through AT_ERROR and never returns a tensor.
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}
+// CPU backward entry point. No CPU implementation is provided: the call always
+// raises through AT_ERROR and never returns gradients.
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");
+}

python/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h ADDED Viewed

	@@ -0,0 +1,33 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include <torch/extension.h>
+at::Tensor /*
+ * Declaration of the CPU forward entry point for multi-scale deformable
+ * attention: five input tensors plus an integer step, returning one tensor.
+ */
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+std::vector<at::Tensor> /*
+ * Declaration of the CPU backward entry point for multi-scale deformable
+ * attention: the forward inputs plus grad_output, returning a vector of
+ * tensors.
+ */
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);

python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu ADDED Viewed

	@@ -0,0 +1,153 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+// Host-side launcher for the forward pass of multi-scale deformable attention.
+//
+// Dimensions are taken from the inputs as follows:
+//   value             sizes 0..3 give batch, spatial_size, num_heads, channels
+//   spatial_shapes    size 0 gives num_levels; elements are read as int64_t
+//   level_start_index elements are read as int64_t
+//   sampling_loc      size 1 gives num_query, size 4 gives num_point
+//   attn_weight       read with the same scalar type as value
+//   im2col_step       number of batch entries handled per kernel launch
+//                     (clamped to batch)
+//
+// Returns the result viewed as {batch, num_query, num_heads * channels}.
+//
+// Fails an assertion when any input is non-contiguous or not a CUDA tensor,
+// when batch or im2col_step is not positive, or when batch is not a multiple
+// of the effective step.
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    // The kernel indexes raw data pointers, so every input must be densely laid out.
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+    const int im2col_step_ = std::min(batch, im2col_step);
+    // A zero or negative step would make the modulo and divisions below undefined.
+    AT_ASSERTM(im2col_step_ > 0, "batch (", batch, ") and im2col_step (", im2col_step, ") must both be positive");
+    // AT_ASSERTM concatenates its message arguments; it does not expand printf-style placeholders.
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch (", batch, ") must be divisible by im2col_step (", im2col_step_, ")");
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+    const int batch_n = im2col_step_;
+    // Group the batch into batch/im2col_step_ chunks of batch_n entries; one kernel launch per chunk.
+    auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    // Element counts of a single batch entry, used to offset the raw pointers per chunk.
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                spatial_shapes.data<int64_t>(),
+                level_start_index.data<int64_t>(),
+                sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data<scalar_t>());
+        }));
+    }
+    // Merge the head and channel dimensions for the caller.
+    output = output.view({batch, num_query, num_heads*channels});
+    return output;
+}
+// Host-side launcher for the backward pass of multi-scale deformable attention.
+//
+// Dimensions are taken from the inputs exactly as in the forward launcher:
+//   value             sizes 0..3 give batch, spatial_size, num_heads, channels
+//   spatial_shapes    size 0 gives num_levels; elements are read as int64_t
+//   level_start_index elements are read as int64_t
+//   sampling_loc      size 1 gives num_query, size 4 gives num_point
+//   grad_output       viewed as {batch/step, step, num_query, num_heads, channels}
+//   im2col_step       number of batch entries handled per kernel launch
+//                     (clamped to batch)
+//
+// Returns {grad_value, grad_sampling_loc, grad_attn_weight}; each is created
+// with zeros_like from the matching input before the kernels fill it in.
+//
+// Fails an assertion when any input is non-contiguous or not a CUDA tensor,
+// when batch or im2col_step is not positive, or when batch is not a multiple
+// of the effective step.
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    // The kernel indexes raw data pointers, so every input must be densely laid out.
+    AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
+    AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+    AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
+    AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
+    AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
+    AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
+    AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+    const int im2col_step_ = std::min(batch, im2col_step);
+    // A zero or negative step would make the modulo and divisions below undefined.
+    AT_ASSERTM(im2col_step_ > 0, "batch (", batch, ") and im2col_step (", im2col_step, ") must both be positive");
+    // AT_ASSERTM concatenates its message arguments; it does not expand printf-style placeholders.
+    AT_ASSERTM(batch % im2col_step_ == 0, "batch (", batch, ") must be divisible by im2col_step (", im2col_step_, ")");
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+    const int batch_n = im2col_step_;
+    // Element counts of a single batch entry, used to offset the raw pointers per chunk.
+    auto per_value_size = spatial_size * num_heads * channels;
+    auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+    auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+    // Group the batch into batch/im2col_step_ chunks of batch_n entries; one kernel launch per chunk.
+    auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
+    for (int n = 0; n < batch/im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data<scalar_t>(),
+                                    value.data<scalar_t>() + n * im2col_step_ * per_value_size,
+                                    spatial_shapes.data<int64_t>(),
+                                    level_start_index.data<int64_t>(),
+                                    sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data<scalar_t>() +  n * im2col_step_ * per_value_size,
+                                    grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
+                                    grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
+        }));
+    }
+    return {
+        grad_value, grad_sampling_loc, grad_attn_weight
+    };
+}

python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h ADDED Viewed

	@@ -0,0 +1,30 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include <torch/extension.h>
+at::Tensor ms_deform_attn_cuda_forward( /*
+ * Declaration of the CUDA forward entry point for multi-scale deformable
+ * attention: five input tensors plus an integer step, returning one tensor.
+ */
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+std::vector<at::Tensor> ms_deform_attn_cuda_backward( /*
+ * Declaration of the CUDA backward entry point for multi-scale deformable
+ * attention: the forward inputs plus grad_output, returning a vector of
+ * tensors.
+ */
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);

python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh ADDED Viewed

	@@ -0,0 +1,1327 @@

+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THCAtomics.cuh>
+#define CUDA_KERNEL_LOOP(i, n)                          \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
+      i < (n);                                          \
+      i += blockDim.x * gridDim.x)  // grid-stride loop: starts at the global thread id and advances by the total thread count while i < n
+const int CUDA_NUM_THREADS = 1024;  // thread-count constant; NOTE(review): its launch sites are outside this view
+inline int GET_BLOCKS(const int N, const int num_threads)  // number of num_threads-sized blocks needed to cover N items (rounds up)
+{
+  return (N + num_threads - 1) / num_threads;  // NOTE(review): the sum can overflow int when N is close to INT_MAX
+}
+template <typename scalar_t> /*
+ * Bilinearly interpolates bottom_data at the fractional position (h, w) for
+ * head m and channel c, and returns the interpolated value.
+ *
+ * The element at row y, column x is read at offset
+ * ((y * width + x) * nheads + m) * channels + c.
+ * A corner that fails its bounds test contributes 0. Only the lower bound of
+ * h_low / w_low and the upper bound of h_high / w_high are tested here; the
+ * kernels in this file call this only when -1 < h < height and
+ * -1 < w < width, which covers the remaining bounds.
+ */
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+  const scalar_t lh = h - h_low;  // fractional distance from the low row
+  const scalar_t lw = w - w_low;  // fractional distance from the low column
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  const int w_stride = nheads * channels;  // elements between horizontally adjacent positions
+  const int h_stride = width * w_stride;  // elements between vertically adjacent positions
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;  // offset of (head m, channel c) within one position
+  scalar_t v1 = 0;  // corner (h_low, w_low)
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+  }
+  scalar_t v2 = 0;  // corner (h_low, w_high)
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+  }
+  scalar_t v3 = 0;  // corner (h_high, w_low)
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+  }
+  scalar_t v4 = 0;  // corner (h_high, w_high)
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+  }
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;  // bilinear weights of corners 1..4
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+template <typename scalar_t> /*
+ * Backward counterpart of the bilinear sampler for one (head m, channel c)
+ * sample at fractional position (h, w).
+ *
+ * For every corner that passes its bounds test it atomically adds
+ * weight_k * top_grad * attn_weight to grad_value at that corner's offset.
+ * It then overwrites, with plain (non-atomic) stores:
+ *   *grad_attn_weight     = top_grad * interpolated value
+ *   grad_sampling_loc[0]  = width  * d(val)/dw * top_grad * attn_weight
+ *   grad_sampling_loc[1]  = height * d(val)/dh * top_grad * attn_weight
+ * The kernels in this file pass per-thread shared-memory slots for the two
+ * overwritten outputs.
+ */
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value,
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+  const scalar_t lh = h - h_low;  // fractional distance from the low row
+  const scalar_t lw = w - w_low;  // fractional distance from the low column
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  const int w_stride = nheads * channels;  // elements between horizontally adjacent positions
+  const int h_stride = width * w_stride;  // elements between vertically adjacent positions
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;  // offset of (head m, channel c) within one position
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;  // bilinear weights of corners 1..4
+  const scalar_t top_grad_value = top_grad * attn_weight;  // upstream gradient as seen by the sampled value
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;  // accumulate d(val)/dh and d(val)/dw over the in-bounds corners
+  scalar_t v1 = 0;  // corner (h_low, w_low)
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;  // corner (h_low, w_high)
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;  // corner (h_high, w_low)
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value);
+  }
+  scalar_t v4 = 0;  // corner (h_high, w_high)
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);  // same interpolated value the forward sampler returns
+  *grad_attn_weight = top_grad * val;
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value;  // width factor: the calling kernels form w as loc_w * width - 0.5
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;  // height factor: the calling kernels form h as loc_h * height - 0.5
+}
+template <typename scalar_t> /*
+ * Variant of ms_deform_attn_col2im_bilinear whose three outputs are all
+ * accumulated with atomicAdd instead of being overwritten:
+ *   grad_value (per in-bounds corner), *grad_attn_weight,
+ *   grad_sampling_loc[0] (w) and grad_sampling_loc[1] (h).
+ * NOTE(review): the "_gm" suffix presumably means the outputs live in global
+ * memory shared between threads - confirm against the calling kernel, which
+ * is outside this view.
+ */
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value,
+                                                   scalar_t* grad_sampling_loc,
+                                                   scalar_t* grad_attn_weight)
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+  const scalar_t lh = h - h_low;  // fractional distance from the low row
+  const scalar_t lw = w - w_low;  // fractional distance from the low column
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+  const int w_stride = nheads * channels;  // elements between horizontally adjacent positions
+  const int h_stride = width * w_stride;  // elements between vertically adjacent positions
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;  // offset of (head m, channel c) within one position
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;  // bilinear weights of corners 1..4
+  const scalar_t top_grad_value = top_grad * attn_weight;  // upstream gradient as seen by the sampled value
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;  // accumulate d(val)/dh and d(val)/dw over the in-bounds corners
+  scalar_t v1 = 0;  // corner (h_low, w_low)
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;  // corner (h_low, w_high)
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;  // corner (h_high, w_low)
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value);
+  }
+  scalar_t v4 = 0;  // corner (h_high, w_high)
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);  // same interpolated value the forward sampler returns
+  atomicAdd(grad_attn_weight, top_grad * val);
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+template <typename scalar_t> /*
+ * Forward kernel. Each loop index in [0, n) produces one element of data_col.
+ *
+ * The index is decomposed, fastest dimension first, into channel (c_col),
+ * head (m_col), query (q_col) and batch entry (b_col). For that output
+ * element the kernel walks every level and every sampling point, bilinearly
+ * samples data_value at the point, multiplies by the attention weight and
+ * sums the results into data_col[index].
+ *
+ * data_spatial_shapes is read as (height, width) pairs per level and
+ * data_level_start_index as the per-level starting position inside one batch
+ * entry of data_value. batch_size and q_col are not used in the body.
+ */
+__global__ void ms_deformable_im2col_gpu_kernel(const int n,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // flattened (batch, query, head) index, shared by all channels of that head
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+    scalar_t *data_col_ptr = data_col + index;
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first attention weight for this (batch, query, head)
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations are stored as (w, h) pairs, so twice the weight offset
+    const int qid_stride = num_heads * channels;  // elements per spatial position of data_value
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch entry in data_value
+    scalar_t col = 0;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level size and shift by half a cell; presumably loc is normalized - TODO confirm
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip points with no in-bounds corner
+        {
+          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+        }
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+    *data_col_ptr = col;
+  }
+}
+template <typename scalar_t, unsigned int blockSize> /*
+ * Backward kernel that reduces per-thread gradients through statically sized
+ * shared memory, with thread 0 summing the slots serially.
+ *
+ * Each loop index is decomposed like the forward kernel (channel, head,
+ * query, batch entry). For every level and sampling point each thread:
+ *   1. zeroes its own slot in the two shared caches,
+ *   2. calls ms_deform_attn_col2im_bilinear, which atomically accumulates
+ *      into grad_value and writes the location / weight gradients into the
+ *      thread's slot,
+ *   3. waits at a barrier, after which thread 0 adds up all blockSize slots
+ *      and stores the sums to grad_sampling_loc / grad_attn_weight.
+ *
+ * NOTE(review): the sum over the whole block is stored to a single output
+ * slot, so this presumably relies on every thread of a block sharing one
+ * sampling_index (e.g. blockDim.x == channels == blockSize) - confirm at the
+ * launch site, which is outside this view.
+ * NOTE(review): grad_sampling_loc and grad_attn_weight are advanced in place
+ * inside CUDA_KERNEL_LOOP, so the offsets would accumulate if a thread ever
+ * executed more than one loop iteration.
+ * NOTE(review): __syncthreads() is only reached by threads with index < n;
+ * confirm that all threads of a block always enter the loop together.
+ * batch_size and q_col are not used in the body.
+ */
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];  // one (w, h) gradient pair per thread
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];  // one weight gradient per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // flattened (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];  // upstream gradient for this output element
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations are stored as (w, h) pairs
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // moves the parameter itself to this sample's first output slot
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;  // elements per spatial position of data_value
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch entry in data_value
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // same coordinate mapping as the forward kernel
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slot so skipped points add nothing
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        __syncthreads();  // every slot must be written before thread 0 reads them
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;  // index of the next (w, h) pair in cache_grad_sampling_loc
+          for (unsigned int tid = 1; tid < blockSize; ++tid)  // this loop variable shadows the outer tid
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();  // keep the caches intact until thread 0 has finished reading
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+template <typename scalar_t, unsigned int blockSize> /*
+ * Backward kernel that reduces per-thread gradients through statically sized
+ * shared memory using a pairwise (halving) reduction.
+ *
+ * Each loop index is decomposed like the forward kernel (channel, head,
+ * query, batch entry). For every level and sampling point each thread:
+ *   1. zeroes its own slot in the two shared caches,
+ *   2. calls ms_deform_attn_col2im_bilinear, which atomically accumulates
+ *      into grad_value and writes the location / weight gradients into the
+ *      thread's slot,
+ *   3. takes part in a reduction where, for s = blockSize/2, blockSize/4,
+ *      ..., 1, thread t < s adds slot t + s into slot t,
+ *   4. thread 0 stores slot 0 to grad_sampling_loc / grad_attn_weight.
+ *
+ * NOTE(review): the halving reduction only folds every slot into slot 0 when
+ * blockSize is a power of two - confirm at the launch site.
+ * NOTE(review): the block-wide result is stored to a single output slot, so
+ * this presumably relies on every thread of a block sharing one
+ * sampling_index (e.g. blockDim.x == channels == blockSize) - confirm.
+ * NOTE(review): grad_sampling_loc and grad_attn_weight are advanced in place
+ * inside CUDA_KERNEL_LOOP, so the offsets would accumulate if a thread ever
+ * executed more than one loop iteration.
+ * batch_size and q_col are not used in the body.
+ */
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];  // one (w, h) gradient pair per thread
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];  // one weight gradient per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // flattened (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];  // upstream gradient for this output element
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations are stored as (w, h) pairs
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // moves the parameter itself to this sample's first output slot
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;  // elements per spatial position of data_value
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // start of this batch entry in data_value
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // same coordinate mapping as the forward kernel
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slot so skipped points add nothing
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        __syncthreads();  // every slot must be written before the reduction starts
+        for (unsigned int s=blockSize/2; s>0; s>>=1)  // halve the active range each round
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;  // this thread's (w, h) pair
+            const unsigned int xid2 = (tid + s) << 1;  // partner pair s slots further on
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();  // finish this round before the next one reads the partial sums
+        }
+        if (tid == 0)
+        {
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();  // keep slot 0 intact until thread 0 has stored it
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+/*
+ * Backward (col2im) kernel for multi-scale deformable attention that reduces the
+ * per-thread gradient partials through dynamically sized shared memory, with
+ * thread 0 of the block summing all partials sequentially.
+ *
+ * Every loop index is decoded, fastest dimension first, into
+ * (c_col, m_col, q_col, b_col) = (channel, head, query, batch).
+ *
+ * Parameters
+ *   n                       bound handed to CUDA_KERNEL_LOOP
+ *   grad_col                upstream gradient, read at [index]
+ *   data_value              value buffer; the per-level slice is forwarded to the
+ *                           bilinear helper
+ *   data_spatial_shapes     per level: [2*l] is the height, [2*l + 1] the width
+ *   data_level_start_index  per level start row inside data_value
+ *   data_sampling_loc       per sample: [2*k] is the w coordinate, [2*k + 1] the h
+ *                           coordinate; both are scaled by the level size here
+ *   data_attn_weight        one attention weight per sample
+ *   batch_size              not read by this kernel
+ *   grad_value              forwarded to the bilinear helper
+ *   grad_sampling_loc       output, two values per sample, stored by thread 0
+ *   grad_attn_weight        output, one value per sample, stored by thread 0
+ *
+ * Launch requirement: 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared
+ * memory (2 location partials + 1 weight partial per thread).
+ *
+ * NOTE(review): the plain stores by thread 0 assume that all threads of a block
+ * work on the same sampling_index and that every thread of a block reaches each
+ * __syncthreads() -- confirm against the launch configuration.
+ */
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Shared layout: [2 * blockDim.x location partials | blockDim.x weight partials].
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    // Decode the flat index; the query coordinate is only divided away because
+    // nothing below needs it.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are per-iteration locals. The kernel arguments stay untouched
+    // so that, should CUDA_KERNEL_LOOP run this body again for the same thread,
+    // the offsets are applied to the original base and not to an already
+    // advanced pointer.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's partials so that out-of-range samples contribute 0.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // All partials must be written before thread 0 reads them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          // Sequential sum over every thread's partial, in thread order.
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int t = 1; t < blockDim.x; ++t)
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[t];
+            sid += 2;
+          }
+          *grad_loc_out = _grad_w;
+          *(grad_loc_out + 1) = _grad_h;
+          *grad_weight_out = _grad_a;
+        }
+        // Keep the shared buffers intact until thread 0 has finished reading.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+/*
+ * Backward (col2im) kernel for multi-scale deformable attention that reduces the
+ * per-thread gradient partials through dynamically sized shared memory using a
+ * tree reduction which also works for block sizes that are not powers of two.
+ *
+ * Every loop index is decoded, fastest dimension first, into
+ * (c_col, m_col, q_col, b_col) = (channel, head, query, batch).
+ *
+ * Parameters
+ *   n                       bound handed to CUDA_KERNEL_LOOP
+ *   grad_col                upstream gradient, read at [index]
+ *   data_value              value buffer; the per-level slice is forwarded to the
+ *                           bilinear helper
+ *   data_spatial_shapes     per level: [2*l] is the height, [2*l + 1] the width
+ *   data_level_start_index  per level start row inside data_value
+ *   data_sampling_loc       per sample: [2*k] is the w coordinate, [2*k + 1] the h
+ *                           coordinate; both are scaled by the level size here
+ *   data_attn_weight        one attention weight per sample
+ *   batch_size              not read by this kernel
+ *   grad_value              forwarded to the bilinear helper
+ *   grad_sampling_loc       output, two values per sample, stored by thread 0
+ *   grad_attn_weight        output, one value per sample, stored by thread 0
+ *
+ * Launch requirement: 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared
+ * memory (2 location partials + 1 weight partial per thread).
+ *
+ * NOTE(review): the plain stores by thread 0 assume that all threads of a block
+ * work on the same sampling_index and that every thread of a block reaches each
+ * __syncthreads() -- confirm against the launch configuration.
+ */
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Shared layout: [2 * blockDim.x location partials | blockDim.x weight partials].
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    // Decode the flat index; the query coordinate is only divided away because
+    // nothing below needs it.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are per-iteration locals. The kernel arguments stay untouched
+    // so that, should CUDA_KERNEL_LOOP run this body again for the same thread,
+    // the offsets are applied to the original base and not to an already
+    // advanced pointer.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's partials so that out-of-range samples contribute 0.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // All partials must be written before the reduction reads them.
+        __syncthreads();
+        // Tree reduction: `spre` is the number of live partials before a step,
+        // `s` the number that remain after it.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            // When spre is odd (spre == 2*s + 1) slot 2*s has no partner; the
+            // condition then holds for tid 0 only, which folds that slot in too.
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+        if (tid == 0)
+        {
+          // Slot 0 now holds the block-wide sums.
+          *grad_loc_out = cache_grad_sampling_loc[0];
+          *(grad_loc_out + 1) = cache_grad_sampling_loc[1];
+          *grad_weight_out = cache_grad_attn_weight[0];
+        }
+        // Keep the shared buffers intact until thread 0 has finished reading.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+/*
+ * Backward (col2im) kernel for multi-scale deformable attention: same shared
+ * memory tree reduction as ms_deformable_col2im_gpu_kernel_shm_reduce_v2, but
+ * the block-wide sums are published with atomicAdd instead of plain stores.
+ *
+ * Every loop index is decoded, fastest dimension first, into
+ * (c_col, m_col, q_col, b_col) = (channel, head, query, batch).
+ *
+ * Parameters
+ *   n                       bound handed to CUDA_KERNEL_LOOP
+ *   grad_col                upstream gradient, read at [index]
+ *   data_value              value buffer; the per-level slice is forwarded to the
+ *                           bilinear helper
+ *   data_spatial_shapes     per level: [2*l] is the height, [2*l + 1] the width
+ *   data_level_start_index  per level start row inside data_value
+ *   data_sampling_loc       per sample: [2*k] is the w coordinate, [2*k + 1] the h
+ *                           coordinate; both are scaled by the level size here
+ *   data_attn_weight        one attention weight per sample
+ *   batch_size              not read by this kernel
+ *   grad_value              forwarded to the bilinear helper
+ *   grad_sampling_loc       output, two values per sample, accumulated atomically
+ *   grad_attn_weight        output, one value per sample, accumulated atomically
+ *
+ * Launch requirement: 3 * blockDim.x * sizeof(scalar_t) bytes of dynamic shared
+ * memory (2 location partials + 1 weight partial per thread).
+ *
+ * NOTE(review): the atomics presumably exist because several blocks can cover the
+ * same sampling_index when `channels` exceeds the block size; the outputs would
+ * then have to be zero-initialised by the caller -- confirm. Every thread of a
+ * block must reach each __syncthreads().
+ */
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Shared layout: [2 * blockDim.x location partials | blockDim.x weight partials].
+    extern __shared__ int _s[];
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
+    unsigned int tid = threadIdx.x;
+    // Decode the flat index; the query coordinate is only divided away because
+    // nothing below needs it.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are per-iteration locals. The kernel arguments stay untouched
+    // so that, should CUDA_KERNEL_LOOP run this body again for the same thread,
+    // the offsets are applied to the original base and not to an already
+    // advanced pointer.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Clear this thread's partials so that out-of-range samples contribute 0.
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // All partials must be written before the reduction reads them.
+        __syncthreads();
+        // Tree reduction: `spre` is the number of live partials before a step,
+        // `s` the number that remain after it.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            // When spre is odd (spre == 2*s + 1) slot 2*s has no partner; the
+            // condition then holds for tid 0 only, which folds that slot in too.
+            if (tid + (s << 1) < spre)
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();
+        }
+        if (tid == 0)
+        {
+          // Accumulate this block's sums into the shared output slots.
+          atomicAdd(grad_loc_out, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_loc_out + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_weight_out, cache_grad_attn_weight[0]);
+        }
+        // Keep the shared buffers intact until thread 0 has finished reading.
+        __syncthreads();
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+/*
+ * Backward (col2im) kernel for multi-scale deformable attention without any
+ * shared memory staging: every thread hands the output pointers for its sample
+ * straight to ms_deform_attn_col2im_bilinear_gm.
+ *
+ * Every loop index is decoded, fastest dimension first, into
+ * (c_col, m_col, q_col, b_col) = (channel, head, query, batch).
+ *
+ * Parameters
+ *   n                       bound handed to CUDA_KERNEL_LOOP
+ *   grad_col                upstream gradient, read at [index]
+ *   data_value              value buffer; the per-level slice is forwarded to the
+ *                           bilinear helper
+ *   data_spatial_shapes     per level: [2*l] is the height, [2*l + 1] the width
+ *   data_level_start_index  per level start row inside data_value
+ *   data_sampling_loc       per sample: [2*k] is the w coordinate, [2*k + 1] the h
+ *                           coordinate; both are scaled by the level size here
+ *   data_attn_weight        one attention weight per sample
+ *   batch_size              not read by this kernel
+ *   grad_value              forwarded to the bilinear helper
+ *   grad_sampling_loc       output, two values per sample, written by the helper
+ *   grad_attn_weight        output, one value per sample, written by the helper
+ *
+ * NOTE(review): many threads (one per channel) target the same output slots, so
+ * the helper presumably accumulates atomically -- its body is not visible here.
+ */
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index,
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size,
+                                                const int spatial_size,
+                                                const int num_heads,
+                                                const int channels,
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Decode the flat index; the query coordinate is only divided away because
+    // nothing below needs it.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    _temp /= num_query;
+    const int b_col = _temp;
+    const scalar_t top_grad = grad_col[index];
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    // Output cursors are per-iteration locals. The kernel arguments stay untouched
+    // so that, should CUDA_KERNEL_LOOP run this body again for the same thread,
+    // the offsets are applied to the original base and not to an already
+    // advanced pointer.
+    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
+    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Samples outside the open range (-1, size) are skipped entirely.
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear_gm(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            grad_loc_out, grad_weight_out);
+        }
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_weight_out += grad_weight_stride;
+        grad_loc_out += grad_loc_stride;
+      }
+    }
+  }
+}
+/*
+ * Host-side launcher for ms_deformable_im2col_gpu_kernel.
+ *
+ * Enqueues the kernel on `stream` with CUDA_NUM_THREADS threads per block and
+ * GET_BLOCKS(...) blocks for batch_size * num_query * num_heads * channels work
+ * items, forwarding every tensor pointer and dimension unchanged. The result is
+ * produced by the kernel in `data_col`.
+ *
+ * A launch error reported by cudaGetLastError() is only printed; nothing is
+ * returned or thrown.
+ */
+template <typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                              const scalar_t* data_value,
+                              const int64_t* data_spatial_shapes,
+                              const int64_t* data_level_start_index,
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)
+{
+  // Loop bound given to the kernel and the work-item count used to size the grid
+  // are the same number.
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = num_kernels;
+  const int num_threads = CUDA_NUM_THREADS;
+
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0, stream>>>(
+          num_kernels,
+          data_value,
+          data_spatial_shapes,
+          data_level_start_index,
+          data_sampling_loc,
+          data_attn_weight,
+          batch_size,
+          spatial_size,
+          num_heads,
+          channels,
+          num_levels,
+          num_query,
+          num_point,
+          data_col);
+
+  // Report (but do not propagate) a failed launch.
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status == cudaSuccess)
+  {
+    return;
+  }
+  printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(launch_status));
+}
+/*
+ * Host-side launcher for the backward (col2im) kernels.
+ *
+ * Picks one kernel variant from `channels` and enqueues it on `stream` with
+ * min(channels, CUDA_NUM_THREADS) threads per block:
+ *
+ *   channels > 1024, multiple of 1024  -> shm_reduce_v2_multi_blocks (dynamic shm)
+ *   channels > 1024, otherwise         -> gm                         (no dynamic shm)
+ *   channels in {1,2,4,8,16,32}        -> shm_blocksize_aware_reduce_v1<scalar_t, channels>
+ *   channels in {64,128,...,1024}      -> shm_blocksize_aware_reduce_v2<scalar_t, channels>
+ *   any other channels < 64            -> shm_reduce_v1              (dynamic shm)
+ *   any other channels >= 64           -> shm_reduce_v2              (dynamic shm)
+ *
+ * "dynamic shm" means num_threads * 3 * sizeof(scalar_t) bytes are requested at
+ * launch. Every variant receives the identical argument list.
+ *
+ * A launch error reported by cudaGetLastError() is only printed; nothing is
+ * returned or thrown.
+ */
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = num_kernels;
+
+// Function-local launch helper. The kernel name is taken variadically so that
+// template-ids containing a comma (kernel<scalar_t, 64>) survive macro argument
+// splitting; SHM_BYTES is the dynamic shared memory size for the launch.
+#define MS_DEFORM_COL2IM_LAUNCH(SHM_BYTES, ...)                                  \
+  __VA_ARGS__                                                                    \
+      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,               \
+         (SHM_BYTES), stream>>>(                                                 \
+          num_kernels,                                                           \
+          grad_col,                                                              \
+          data_value,                                                            \
+          data_spatial_shapes,                                                   \
+          data_level_start_index,                                                \
+          data_sampling_loc,                                                     \
+          data_attn_weight,                                                      \
+          batch_size,                                                            \
+          spatial_size,                                                          \
+          num_heads,                                                             \
+          channels,                                                              \
+          num_levels,                                                            \
+          num_query,                                                             \
+          num_point,                                                             \
+          grad_value,                                                            \
+          grad_sampling_loc,                                                     \
+          grad_attn_weight)
+
+  if (channels > 1024)
+  {
+    // More channels than one block can hold.
+    if ((channels & 1023) == 0)
+    {
+      MS_DEFORM_COL2IM_LAUNCH(num_threads*3*sizeof(scalar_t),
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>);
+    }
+    else
+    {
+      MS_DEFORM_COL2IM_LAUNCH(0,
+          ms_deformable_col2im_gpu_kernel_gm<scalar_t>);
+    }
+  }
+  else
+  {
+    switch(channels)
+    {
+      // Power-of-two channel counts: block size is a template argument.
+      case 1:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>);
+        break;
+      case 2:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>);
+        break;
+      case 4:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>);
+        break;
+      case 8:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>);
+        break;
+      case 16:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>);
+        break;
+      case 32:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>);
+        break;
+      case 64:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>);
+        break;
+      case 128:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>);
+        break;
+      case 256:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>);
+        break;
+      case 512:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>);
+        break;
+      case 1024:
+        MS_DEFORM_COL2IM_LAUNCH(0,
+            ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>);
+        break;
+      // Every other channel count: block size only known at run time.
+      default:
+        if (channels < 64)
+        {
+          MS_DEFORM_COL2IM_LAUNCH(num_threads*3*sizeof(scalar_t),
+              ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>);
+        }
+        else
+        {
+          MS_DEFORM_COL2IM_LAUNCH(num_threads*3*sizeof(scalar_t),
+              ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>);
+        }
+    }
+  }
+
+#undef MS_DEFORM_COL2IM_LAUNCH
+
+  // Report (but do not propagate) a failed launch.
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(launch_status));
+  }
+}

python/utils/dependencies/XPose/models/UniPose/ops/src/ms_deform_attn.h ADDED Viewed

	@@ -0,0 +1,62 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include "cpu/ms_deform_attn_cpu.h"
+#ifdef WITH_CUDA
+#include "cuda/ms_deform_attn_cuda.h"
+#endif
+// Device dispatcher for the multi-scale deformable attention forward pass.
+// A CUDA `value` tensor is forwarded to ms_deform_attn_cuda_forward when the
+// extension was built WITH_CUDA; every other combination raises an error
+// (no CPU implementation is wired up here).
+at::Tensor
+ms_deform_attn_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    // Tensor::is_cuda() replaces the deprecated value.type().is_cuda() query.
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        // TORCH_CHECK(false, ...) is the supported spelling of the deprecated AT_ERROR.
+        TORCH_CHECK(false, "Not compiled with GPU support");
+#endif
+    }
+    TORCH_CHECK(false, "Not implemented on the CPU");
+}
+// Device dispatcher for the multi-scale deformable attention backward pass.
+// A CUDA `value` tensor is forwarded to ms_deform_attn_cuda_backward when the
+// extension was built WITH_CUDA; every other combination raises an error
+// (no CPU implementation is wired up here).
+std::vector<at::Tensor>
+ms_deform_attn_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    // Tensor::is_cuda() replaces the deprecated value.type().is_cuda() query.
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        // TORCH_CHECK(false, ...) is the supported spelling of the deprecated AT_ERROR.
+        TORCH_CHECK(false, "Not compiled with GPU support");
+#endif
+    }
+    TORCH_CHECK(false, "Not implemented on the CPU");
+}

python/utils/dependencies/XPose/models/UniPose/ops/src/vision.cpp ADDED Viewed

	@@ -0,0 +1,16 @@

+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include "ms_deform_attn.h"
+// Python bindings: register the forward/backward dispatchers declared in
+// ms_deform_attn.h under the module name supplied by the build
+// (TORCH_EXTENSION_NAME). Each function is exposed under its own C++ name.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
+  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
+}

python/utils/dependencies/XPose/models/UniPose/ops/test.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+import time
+import torch
+import torch.nn as nn
+from torch.autograd import gradcheck
+from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
+# Tiny problem dimensions shared by every check in this script.
+# NOTE(review): presumably N=batch, M=heads, D=channels per head, Lq=queries,
+# L=feature levels, P=sampling points per level -- confirm against MSDeformAttn.
+N, M, D = 1, 2, 2
+Lq, L, P = 2, 2, 2
+# One (H, W) row per level; moved to the GPU at import time, so CUDA is required.
+shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
+# Exclusive prefix sum of the per-level H*W products (first entry is 0).
+level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
+# Total number of spatial positions summed over all levels.
+S = sum([(H*W).item() for H, W in shapes])
+# Fixed seed so the random test inputs are reproducible.
+torch.manual_seed(3)
+@torch.no_grad()
+def check_forward_equal_with_pytorch_double():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+@torch.no_grad()
+def check_forward_equal_with_pytorch_float():
+    value = torch.rand(N, S, M, D).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
+    output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
+    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
+    max_abs_err = (output_cuda - output_pytorch).abs().max()
+    max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
+    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
+def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
+    value = torch.rand(N, S, M, channels).cuda() * 0.01
+    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
+    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
+    im2col_step = 2
+    func = MSDeformAttnFunction.apply
+    value.requires_grad = grad_value
+    sampling_locations.requires_grad = grad_sampling_loc
+    attention_weights.requires_grad = grad_attn_weight
+    gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
+    print(f'* {gradok} check_gradient_numerical(D={channels})')
+# Script entry point: run both forward comparisons, then the numerical
+# gradient check once for each listed channel count.
+if __name__ == '__main__':
+    check_forward_equal_with_pytorch_double()
+    check_forward_equal_with_pytorch_float()
+    for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
+        check_gradient_numerical(channels, True, True, True)

python/utils/dependencies/XPose/models/UniPose/position_encoding.py ADDED Viewed

	@@ -0,0 +1,157 @@

+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Conditional DETR
+# Copyright (c) 2021 Microsoft. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copied from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# ------------------------------------------------------------------------
+"""
+Various positional encodings for the transformer.
+"""
+import math
+import torch
+from torch import nn
+from util.misc import NestedTensor
+class PositionEmbeddingSine(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        mask = tensor_list.mask
+        assert mask is not None
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            # if os.environ.get("SHILONG_AMP", None) == '1':
+            #     eps = 1e-4
+            # else:
+            #     eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+class PositionEmbeddingSineHW(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention is all you need paper, generalized to work on images.
+    """
+    def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.temperatureH = temperatureH
+        self.temperatureW = temperatureW
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        mask = tensor_list.mask
+        assert mask is not None
+        not_mask = ~mask
+        y_embed = not_mask.cumsum(1, dtype=torch.float32)
+        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        # import ipdb; ipdb.set_trace()
+        if self.normalize:
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+        dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / self.num_pos_feats)
+        pos_x = x_embed[:, :, :, None] / dim_tx
+        dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / self.num_pos_feats)
+        pos_y = y_embed[:, :, :, None] / dim_ty
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        # import ipdb; ipdb.set_trace()
+        return pos
+class PositionEmbeddingLearned(nn.Module):
+    """
+    Absolute pos embedding, learned.
+    """
+    def __init__(self, num_pos_feats=256):
+        super().__init__()
+        self.row_embed = nn.Embedding(50, num_pos_feats)
+        self.col_embed = nn.Embedding(50, num_pos_feats)
+        self.reset_parameters()
+    def reset_parameters(self):
+        nn.init.uniform_(self.row_embed.weight)
+        nn.init.uniform_(self.col_embed.weight)
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        h, w = x.shape[-2:]
+        i = torch.arange(w, device=x.device)
+        j = torch.arange(h, device=x.device)
+        x_emb = self.col_embed(i)
+        y_emb = self.row_embed(j)
+        pos = torch.cat([
+            x_emb.unsqueeze(0).repeat(h, 1, 1),
+            y_emb.unsqueeze(1).repeat(1, w, 1),
+        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+        return pos
+def build_position_encoding(args):
+    N_steps = args.hidden_dim // 2
+    if args.position_embedding in ('v2', 'sine'):
+        # TODO find a better way of exposing other arguments
+        position_embedding = PositionEmbeddingSineHW(
+            N_steps,
+            temperatureH=args.pe_temperatureH,
+            temperatureW=args.pe_temperatureW,
+            normalize=True
+        )
+    elif args.position_embedding in ('v3', 'learned'):
+        position_embedding = PositionEmbeddingLearned(N_steps)
+    else:
+        raise ValueError(f"not supported {args.position_embedding}")
+    return position_embedding

python/utils/dependencies/XPose/models/UniPose/swin_transformer.py ADDED Viewed

	@@ -0,0 +1,701 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+import numpy as np
+from util.misc import NestedTensor
+# from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+from src.modules.util import DropPath, to_2tuple, trunc_normal_
+class Mlp(nn.Module):
+    """ Multilayer perceptron."""
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows
+def window_reverse(windows, window_size, H, W):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    B = int(windows.shape[0] / (H * W / window_size / window_size))
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+class WindowAttention(nn.Module):
+    """ Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # Note: a qk_scale of 0 is falsy and therefore also falls back to the default.
+        self.scale = qk_scale or head_dim ** -0.5
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        # Scale the row offset by the number of distinct column offsets so that
+        # the sum below maps every (row, col) offset pair to a unique table row.
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        # Registered as a buffer: saved with the module but not a trainable parameter.
+        self.register_buffer("relative_position_index", relative_position_index)
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        trunc_normal_(self.relative_position_bias_table, std=.02)
+        self.softmax = nn.Softmax(dim=-1)
+    def forward(self, x, mask=None):
+        """ Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: additive attention mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        Returns:
+            Tensor with the same (num_windows*B, N, C) shape as ``x``.
+        """
+        B_, N, C = x.shape
+        # One projection yields q, k and v; after the permute the layout is
+        # (3, B_, num_heads, N, C // num_heads).
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+        # Look up the learned bias for every token pair via the precomputed index.
+        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+        if mask is not None:
+            nW = mask.shape[0]
+            # Expose the window axis so the per-window mask broadcasts over batch and heads.
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+        attn = self.attn_drop(attn)
+        # Merge the heads back into the channel dimension.
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class SwinTransformerBlock(nn.Module):
+    """ Swin Transformer Block.
+    The spatial size of the input is not a forward argument: the caller must set
+    the ``H`` and ``W`` attributes on the block before calling it.
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
+                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
+                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
+            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
+        # Stochastic depth is only instantiated for a positive rate.
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+        # Spatial size of the current input; must be assigned before forward().
+        self.H = None
+        self.W = None
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C), where H and W are read
+                from ``self.H`` / ``self.W``.
+            mask_matrix: Attention mask for cyclic shift; only used when
+                ``shift_size > 0``.
+        Returns:
+            Tensor of size (B, H*W, C).
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        # F.pad takes pairs from the last dim backwards: (C, W, H) here.
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+        _, Hp, Wp, _ = x.shape
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+        # partition windows
+        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+        # Crop away the right/bottom padding added above.
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+        x = x.view(B, H * W, C)
+        # Residual connection around attention, then around the FFN
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+class PatchMerging(nn.Module):
+    """ Patch Merging Layer
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+        x = x.view(B, H, W, C)
+        # padding
+        pad_input = (H % 2 == 1) or (W % 2 == 1)
+        if pad_input:
+            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
+        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+        x = self.norm(x)
+        x = self.reduction(x)
+        return x
+class BasicLayer(nn.Module):
+    """ A basic Swin Transformer layer for one stage.
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        num_heads (int): Number of attention head.
+        window_size (int): Local window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+    """
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 use_checkpoint=False):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        # build blocks
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                norm_layer=norm_layer)
+            for i in range(depth)])
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        # calculate attention mask for SW-MSA
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+        for blk in self.blocks:
+            blk.H, blk.W = H, W
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, attn_mask)
+            else:
+                x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+class PatchEmbed(nn.Module):
+    """ Image to Patch Embedding
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Module, optional): Normalization layer. Default: None
+    """
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+    def forward(self, x):
+        """Forward function."""
+        # padding
+        _, _, H, W = x.size()
+        if W % self.patch_size[1] != 0:
+            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
+        if H % self.patch_size[0] != 0:
+            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is not None:
+            Wh, Ww = x.size(2), x.size(3)
+            x = x.flatten(2).transpose(1, 2)
+            x = self.norm(x)
+            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
+        return x
+class SwinTransformer(nn.Module):
+    """ Swin Transformer backbone.
+        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
+          https://arxiv.org/pdf/2103.14030
+    Args:
+        pretrain_img_size (int): Input image size for training the pretrained model,
+            used in absolute postion embedding. Default 224.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+        num_heads (tuple[int]): Number of attention head of each stage.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        drop_rate (float): Dropout rate.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+        dilation (bool): if True, the output size if 16x downsample, ow 32x downsample.
+    """
+    def __init__(self,
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 ape=False,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),
+                 frozen_stages=-1,
+                 dilation=False,
+                 use_checkpoint=False):
+        super().__init__()
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.dilation = dilation
+        # if use_checkpoint:
+        #     print("use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!")
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
+            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
+            trunc_normal_(self.absolute_pos_embed, std=.02)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        # stochastic depth
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+        # build layers
+        self.layers = nn.ModuleList()
+        # prepare downsample list
+        downsamplelist = [PatchMerging for i in range(self.num_layers)]
+        downsamplelist[-1] = None
+        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
+        if self.dilation:
+            downsamplelist[-2] = None
+            num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                # dim=int(embed_dim * 2 ** i_layer),
+                dim=num_features[i_layer],
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
+                downsample=downsamplelist[i_layer],
+                use_checkpoint=use_checkpoint)
+            self.layers.append(layer)
+        # num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
+        self.num_features = num_features
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_module(layer_name, layer)
+        self._freeze_stages()
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.requires_grad = False
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+    def forward_raw(self, x):
+        """Forward function."""
+        x = self.patch_embed(x)
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+            # import ipdb; ipdb.set_trace()
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
+                outs.append(out)
+        # in:
+        #   torch.Size([2, 3, 1024, 1024])
+        # outs:
+        #   [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
+        #       torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
+        return tuple(outs)
+    def forward(self, tensor_list: NestedTensor):
+        x = tensor_list.tensors
+        """Forward function."""
+        x = self.patch_embed(x)
+        Wh, Ww = x.size(2), x.size(3)
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
+        else:
+            x = x.flatten(2).transpose(1, 2)
+        x = self.pos_drop(x)
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
+                outs.append(out)
+        # in:
+        #   torch.Size([2, 3, 1024, 1024])
+        # out:
+        #   [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
+        #       torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
+        # collect for nesttensors
+        outs_dict = {}
+        for idx, out_i in enumerate(outs):
+            m = tensor_list.mask
+            assert m is not None
+            mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]
+            outs_dict[idx] = NestedTensor(out_i, mask)
+        return outs_dict
+    def train(self, mode=True):
+        """Convert the model into training mode while keep layers freezed."""
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+def build_swin_transformer(modelname, pretrain_img_size, **kw):
+    assert modelname in ['swin_T_224_1k', 'swin_B_224_22k', 'swin_B_384_22k', 'swin_L_224_22k', 'swin_L_384_22k']
+    model_para_dict = {
+        'swin_T_224_1k': dict(
+            embed_dim=96,
+            depths=[ 2, 2, 6, 2 ],
+            num_heads=[ 3, 6, 12, 24],
+            window_size=7
+        ),
+        'swin_B_224_22k': dict(
+            embed_dim=128,
+            depths=[ 2, 2, 18, 2 ],
+            num_heads=[ 4, 8, 16, 32 ],
+            window_size=7
+        ),
+        'swin_B_384_22k': dict(
+            embed_dim=128,
+            depths=[ 2, 2, 18, 2 ],
+            num_heads=[ 4, 8, 16, 32 ],
+            window_size=12
+        ),
+        'swin_L_224_22k': dict(
+            embed_dim=192,
+            depths=[ 2, 2, 18, 2 ],
+            num_heads=[ 6, 12, 24, 48 ],
+            window_size=7
+        ),
+        'swin_L_384_22k': dict(
+            embed_dim=192,
+            depths=[ 2, 2, 18, 2 ],
+            num_heads=[ 6, 12, 24, 48 ],
+            window_size=12
+        ),
+    }
+    kw_cgf = model_para_dict[modelname]
+    kw_cgf.update(kw)
+    model = SwinTransformer(pretrain_img_size=pretrain_img_size, **kw_cgf)
+    return model
+if __name__ == "__main__":
+    model = build_swin_transformer('swin_L_384_22k', 384, dilation=True)
+    x = torch.rand(2, 3, 1024, 1024)
+    y = model.forward_raw(x)
+    import ipdb; ipdb.set_trace()
+    x = torch.rand(2, 3, 384, 384)
+    y = model.forward_raw(x)

python/utils/dependencies/XPose/models/UniPose/transformer_deformable.py ADDED Viewed

	@@ -0,0 +1,595 @@

+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+import copy
+import math
+import torch
+from torch import nn, Tensor
+from torch.nn.init import xavier_uniform_, constant_, normal_
+from typing import Optional
+from util.misc import inverse_sigmoid
+from .ops.modules import MSDeformAttn
+from .utils import MLP, _get_activation_fn, gen_sineembed_for_position
+class DeformableTransformer(nn.Module):
+    def __init__(self, d_model=256, nhead=8,
+                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
+                 activation="relu", return_intermediate_dec=False,
+                 num_feature_levels=4, dec_n_points=4,  enc_n_points=4,
+                 two_stage=False, two_stage_num_proposals=300,
+                 use_dab=False, high_dim_query_update=False, no_sine_embed=False):
+        super().__init__()
+        self.d_model = d_model
+        self.nhead = nhead
+        self.two_stage = two_stage
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.use_dab = use_dab
+        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
+                                                          dropout, activation,
+                                                          num_feature_levels, nhead, enc_n_points)
+        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)
+        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
+                                                          dropout, activation,
+                                                          num_feature_levels, nhead, dec_n_points)
+        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec,
+                                                            use_dab=use_dab, d_model=d_model, high_dim_query_update=high_dim_query_update, no_sine_embed=no_sine_embed)
+        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
+        if two_stage:
+            self.enc_output = nn.Linear(d_model, d_model)
+            self.enc_output_norm = nn.LayerNorm(d_model)
+            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
+            self.pos_trans_norm = nn.LayerNorm(d_model * 2)
+        else:
+            if not self.use_dab:
+                self.reference_points = nn.Linear(d_model, 2)
+        self.high_dim_query_update = high_dim_query_update
+        if high_dim_query_update:
+            assert not self.use_dab, "use_dab must be True"
+        self._reset_parameters()
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MSDeformAttn):
+                m._reset_parameters()
+        if not self.two_stage and not self.use_dab:
+            xavier_uniform_(self.reference_points.weight.data, gain=1.0)
+            constant_(self.reference_points.bias.data, 0.)
+        normal_(self.level_embed)
+    def get_proposal_pos_embed(self, proposals):
+        num_pos_feats = 128
+        temperature = 10000
+        scale = 2 * math.pi
+        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
+        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
+        # N, L, 4
+        proposals = proposals.sigmoid() * scale
+        # N, L, 4, 128
+        pos = proposals[:, :, :, None] / dim_t
+        # N, L, 4, 64, 2
+        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
+        return pos
+    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
+        N_, S_, C_ = memory.shape
+        base_scale = 4.0
+        proposals = []
+        _cur = 0
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
+            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
+                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
+            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
+            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
+            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
+            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
+            proposals.append(proposal)
+            _cur += (H_ * W_)
+        output_proposals = torch.cat(proposals, 1)
+        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))
+        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
+        output_memory = memory
+        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
+        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
+        output_memory = self.enc_output_norm(self.enc_output(output_memory))
+        return output_memory, output_proposals
+    def get_valid_ratio(self, mask):
+        _, H, W = mask.shape
+        valid_H = torch.sum(~mask[:, :, 0], 1)
+        valid_W = torch.sum(~mask[:, 0, :], 1)
+        valid_ratio_h = valid_H.float() / H
+        valid_ratio_w = valid_W.float() / W
+        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+        return valid_ratio
+    def forward(self, srcs, masks, pos_embeds, query_embed=None):
+        """
+        Input:
+            - srcs: List([bs, c, h, w])
+            - masks: List([bs, h, w])
+        """
+        assert self.two_stage or query_embed is not None
+        # prepare input for encoder
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
+            bs, c, h, w = src.shape
+            spatial_shape = (h, w)
+            spatial_shapes.append(spatial_shape)
+            src = src.flatten(2).transpose(1, 2)                # bs, hw, c
+            mask = mask.flatten(1)                              # bs, hw
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)    # bs, hw, c
+            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            src_flatten.append(src)
+            mask_flatten.append(mask)
+        src_flatten = torch.cat(src_flatten, 1)     # bs, \sum{hxw}, c
+        mask_flatten = torch.cat(mask_flatten, 1)   # bs, \sum{hxw}
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+        # encoder
+        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)
+        # import ipdb; ipdb.set_trace()
+        # prepare input for decoder
+        bs, _, c = memory.shape
+        if self.two_stage:
+            output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)
+            # hack implementation for two-stage Deformable DETR
+            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
+            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals
+            topk = self.two_stage_num_proposals
+            topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
+            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+            topk_coords_unact = topk_coords_unact.detach()
+            reference_points = topk_coords_unact.sigmoid()
+            init_reference_out = reference_points
+            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
+            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
+        elif self.use_dab:
+            reference_points = query_embed[..., self.d_model:].sigmoid()
+            tgt = query_embed[..., :self.d_model]
+            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
+            init_reference_out = reference_points
+        else:
+            query_embed, tgt = torch.split(query_embed, c, dim=1)
+            query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)
+            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
+            reference_points = self.reference_points(query_embed).sigmoid()
+                # bs, num_quires, 2
+            init_reference_out = reference_points
+        # decoder
+        # import ipdb; ipdb.set_trace()
+        hs, inter_references = self.decoder(tgt, reference_points, memory,
+                                            spatial_shapes, level_start_index, valid_ratios,
+                                            query_pos=query_embed if not self.use_dab else None,
+                                            src_padding_mask=mask_flatten)
+        inter_references_out = inter_references
+        if self.two_stage:
+            return hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact
+        return hs, init_reference_out, inter_references_out, None, None
+class DeformableTransformerEncoderLayer(nn.Module):
+    def __init__(self,
+                 d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 add_channel_attention=False,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 ):
+        super().__init__()
+        # self attention
+        if use_deformable_box_attn:
+            self.self_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
+        else:
+            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+        # channel attention
+        self.add_channel_attention = add_channel_attention
+        if add_channel_attention:
+            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
+            self.norm_channel = nn.LayerNorm(d_model)
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+    def forward_ffn(self, src):
+        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
+        src = src + self.dropout3(src2)
+        src = self.norm2(src)
+        return src
+    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
+        # self attention
+        # import ipdb; ipdb.set_trace()
+        src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, key_padding_mask)
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        # ffn
+        src = self.forward_ffn(src)
+        # channel attn
+        if self.add_channel_attention:
+            src = self.norm_channel(src + self.activ_channel(src))
+        return src
+class DeformableTransformerEncoder(nn.Module):
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        if num_layers > 0:
+            self.layers = _get_clones(encoder_layer, num_layers)
+        else:
+            self.layers = []
+            del encoder_layer
+        self.num_layers = num_layers
+        self.norm = norm
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        reference_points_list = []
+        for lvl, (H_, W_) in enumerate(spatial_shapes):
+            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
+                                          torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
+            ref = torch.stack((ref_x, ref_y), -1)
+            reference_points_list.append(ref)
+        reference_points = torch.cat(reference_points_list, 1)
+        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+        return reference_points
+    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
+        """
+        Input:
+            - src: [bs, sum(hi*wi), 256]
+            - spatial_shapes: h,w of each level [num_level, 2]
+            - level_start_index: [num_level] start point of level in sum(hi*wi).
+            - valid_ratios: [bs, num_level, 2]
+            - pos: pos embed for src. [bs, sum(hi*wi), 256]
+            - padding_mask: [bs, sum(hi*wi)]
+        Intermedia:
+            - reference_points: [bs, sum(hi*wi), num_lebel, 2]
+        """
+        output = src
+        # bs, sum(hi*wi), 256
+        # import ipdb; ipdb.set_trace()
+        if self.num_layers > 0:
+            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
+        for _, layer in enumerate(self.layers):
+            output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
+        if self.norm is not None:
+            output = self.norm(output)
+        return output
+class DeformableTransformerDecoderLayer(nn.Module):
+    def __init__(self, d_model=256, d_ffn=1024,
+                 dropout=0.1, activation="relu",
+                 n_levels=4, n_heads=8, n_points=4,
+                 use_deformable_box_attn=False,
+                 box_attn_type='roi_align',
+                 key_aware_type=None,
+                 decoder_sa_type='ca',
+                 module_seq=['sa', 'ca', 'ffn'],
+                 ):
+        super().__init__()
+        self.module_seq = module_seq
+        assert sorted(module_seq) == ['ca', 'ffn', 'sa']
+        # cross attention
+        # self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        if use_deformable_box_attn:
+            self.cross_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
+        else:
+            self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model)
+        # self attention
+        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model)
+        # ffn
+        self.linear1 = nn.Linear(d_model, d_ffn)
+        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(d_ffn, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.key_aware_type = key_aware_type
+        self.key_aware_proj = None
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
+        if decoder_sa_type == 'ca_content':
+            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
+    def rm_self_attn_modules(self):
+        self.self_attn = None
+        self.dropout2 = None
+        self.norm2 = None
+    @staticmethod
+    def with_pos_embed(tensor, pos):
+        return tensor if pos is None else tensor + pos
+    def forward_ffn(self, tgt):
+        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+        return tgt
+    def forward_sa(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+        # self attention
+        if self.self_attn is not None:
+            # import ipdb; ipdb.set_trace()
+            if self.decoder_sa_type == 'sa':
+                q = k = self.with_pos_embed(tgt, tgt_query_pos)
+                tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            elif self.decoder_sa_type == 'ca_label':
+                # import ipdb; ipdb.set_trace()
+                # q = self.with_pos_embed(tgt, tgt_query_pos)
+                bs = tgt.shape[1]
+                k = v = self.label_embedding.weight[:, None, :].repeat(1, bs, 1)
+                tgt2 = self.self_attn(tgt, k, v, attn_mask=self_attn_mask)[0]
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            elif self.decoder_sa_type == 'ca_content':
+                tgt2 = self.self_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                            tgt_reference_points.transpose(0, 1).contiguous(),
+                            memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
+                tgt = tgt + self.dropout2(tgt2)
+                tgt = self.norm2(tgt)
+            else:
+                raise NotImplementedError("Unknown decoder_sa_type {}".format(self.decoder_sa_type))
+        return tgt
+    def forward_ca(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+        # cross attention
+        # import ipdb; ipdb.set_trace()
+        if self.key_aware_type is not None:
+            if self.key_aware_type == 'mean':
+                tgt = tgt + memory.mean(0, keepdim=True)
+            elif self.key_aware_type == 'proj_mean':
+                tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True)
+            else:
+                raise NotImplementedError("Unknown key_aware_type: {}".format(self.key_aware_type))
+        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
+                               tgt_reference_points.transpose(0, 1).contiguous(),
+                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+        return tgt
+    def forward(self,
+                # for tgt
+                tgt: Optional[Tensor],  # nq, bs, d_model
+                tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos))
+                tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. Sine(pos)
+                tgt_key_padding_mask: Optional[Tensor] = None,
+                tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4
+                # for memory
+                memory: Optional[Tensor] = None, # hw, bs, d_model
+                memory_key_padding_mask: Optional[Tensor] = None,
+                memory_level_start_index: Optional[Tensor] = None, # num_levels
+                memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2
+                memory_pos: Optional[Tensor] = None, # pos for memory
+                # sa
+                self_attn_mask: Optional[Tensor] = None, # mask used for self-attention
+                cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention
+            ):
+        for funcname in self.module_seq:
+            # if os.environ.get('IPDB_DEBUG_SHILONG') == 'INFO':
+            #     import ipdb; ipdb.set_trace()
+            if funcname == 'ffn':
+                tgt = self.forward_ffn(tgt)
+            elif funcname == 'ca':
+                tgt = self.forward_ca(tgt, tgt_query_pos, tgt_query_sine_embed, \
+                    tgt_key_padding_mask, tgt_reference_points, \
+                        memory, memory_key_padding_mask, memory_level_start_index, \
+                            memory_spatial_shapes, memory_pos, self_attn_mask, cross_attn_mask)
+            elif funcname == 'sa':
+                tgt = self.forward_sa(tgt, tgt_query_pos, tgt_query_sine_embed, \
+                    tgt_key_padding_mask, tgt_reference_points, \
+                        memory, memory_key_padding_mask, memory_level_start_index, \
+                            memory_spatial_shapes, memory_pos, self_attn_mask, cross_attn_mask)
+            else:
+                raise ValueError('unknown funcname {}'.format(funcname))
+        return tgt
+class DeformableTransformerDecoder(nn.Module):
+    """Stack of cloned decoder layers that returns the output of every layer.
+    bbox_embed and class_embed start as None and are expected to be assigned from
+    outside; when bbox_embed is set, reference points are refined after each layer.
+    """
+    def __init__(self, decoder_layer, num_layers, return_intermediate=False, use_dab=False, d_model=256, query_dim=4):
+        """Clone decoder_layer num_layers times and set up the optional DAB query heads.
+        return_intermediate must be truthy (enforced by an assert) even though it defaults to False.
+        With use_dab, two 2-layer MLPs are created: query_scale and ref_point_head.
+        """
+        super().__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.return_intermediate = return_intermediate
+        # forward() always stacks per-layer outputs, so the non-intermediate mode is rejected.
+        assert return_intermediate
+        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+        self.bbox_embed = None
+        self.class_embed = None
+        self.use_dab = use_dab
+        self.d_model = d_model
+        self.query_dim = query_dim
+        if use_dab:
+            self.query_scale = MLP(d_model, d_model, d_model, 2)
+            self.ref_point_head = MLP(2 * d_model, d_model, d_model, 2)
+    def forward(self, tgt, reference_points, src, src_spatial_shapes,
+                src_level_start_index, src_valid_ratios,
+                query_pos=None, src_padding_mask=None):
+        """Run all decoder layers and collect their outputs and reference points.
+        reference_points must have a last dimension of 4 or 2 (asserted below).
+        With use_dab, query_pos must be None; it is recomputed per layer from the
+        reference points.
+        Returns:
+            (torch.stack of the per-layer outputs, torch.stack of the collected reference points).
+        """
+        output = tgt
+        if self.use_dab:
+            assert query_pos is None
+        intermediate = []
+        # The initial reference points are always the first collected entry.
+        intermediate_reference_points = [reference_points]
+        for layer_id, layer in enumerate(self.layers):
+            # import ipdb; ipdb.set_trace()
+            # Scale the reference points by the per-level valid ratios before handing them to the layer.
+            if reference_points.shape[-1] == 4:
+                reference_points_input = reference_points[:, :, None] \
+                                         * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None] # bs, nq, 4, 4
+            else:
+                assert reference_points.shape[-1] == 2
+                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]
+            if self.use_dab:
+                # import ipdb; ipdb.set_trace()
+                # The positional query is derived from index 0 along dim 2 of the scaled reference points.
+                query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # bs, nq, 256*2
+                raw_query_pos = self.ref_point_head(query_sine_embed) # bs, nq, 256
+                # The first layer uses the raw positional query; later layers scale it by an MLP of the current output.
+                pos_scale = self.query_scale(output) if layer_id != 0 else 1
+                query_pos = pos_scale * raw_query_pos
+            output = layer(output, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask)
+            # hack implementation for iterative bounding box refinement
+            if self.bbox_embed is not None:
+                box_holder = self.bbox_embed(output)
+                # The predicted delta is added in inverse-sigmoid space, in place on the head's output.
+                box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points)
+                new_reference_points = box_holder[..., :self.query_dim].sigmoid()
+                # The next layer receives a detached copy; the collected entry keeps its graph.
+                reference_points = new_reference_points.detach()
+                # The last layer's refined points are not collected.
+                if layer_id != self.num_layers - 1:
+                    intermediate_reference_points.append(new_reference_points)
+            intermediate.append(output)
+        return torch.stack(intermediate), torch.stack(intermediate_reference_points)
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+def build_deforamble_transformer(args):
+    return DeformableTransformer(
+        d_model=args.hidden_dim,
+        nhead=args.nheads,
+        num_encoder_layers=args.enc_layers,
+        num_decoder_layers=args.dec_layers,
+        dim_feedforward=args.dim_feedforward,
+        dropout=args.dropout,
+        activation="relu",
+        return_intermediate_dec=True,
+        num_feature_levels=args.ddetr_num_feature_levels,
+        dec_n_points=args.ddetr_dec_n_points,
+        enc_n_points=args.ddetr_enc_n_points,
+        two_stage=args.ddetr_two_stage,
+        two_stage_num_proposals=args.num_queries,
+        use_dab=args.ddetr_use_dab,
+        high_dim_query_update=args.ddetr_high_dim_query_update,
+        no_sine_embed=args.ddetr_no_sine_embed)

python/utils/dependencies/XPose/models/UniPose/transformer_vanilla.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+DETR Transformer class.
+Copy-paste from torch.nn.Transformer with modifications:
+    * positional encodings are passed in MHattention
+    * extra LN at the end of encoder is removed
+    * decoder returns a stack of activations from all decoding layers
+"""
+import torch
+from torch import Tensor, nn
+from typing import List, Optional
+from .utils import  _get_activation_fn, _get_clones
+class TextTransformer(nn.Module):
+    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):
+        super().__init__()
+        self.num_layers = num_layers
+        self.d_model = d_model
+        self.nheads = nheads
+        self.dim_feedforward = dim_feedforward
+        self.norm = None
+        single_encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout)
+        self.layers = _get_clones(single_encoder_layer, num_layers)
+    def forward(self, memory_text:torch.Tensor, text_attention_mask:torch.Tensor):
+        """
+        Args:
+            text_attention_mask: bs, num_token
+            memory_text: bs, num_token, d_model
+        Raises:
+            RuntimeError: _description_
+        Returns:
+            output: bs, num_token, d_model
+        """
+        output = memory_text.transpose(0, 1)
+        for layer in self.layers:
+            output = layer(output, src_key_padding_mask=text_attention_mask)
+        if self.norm is not None:
+            output = self.norm(output)
+        return output.transpose(0, 1)
+class TransformerEncoderLayer(nn.Module):
+    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+        self.nhead = nhead
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+    def forward(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        # repeat attn mask
+        if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:
+            # bs, num_q, num_k
+            src_mask = src_mask.repeat(self.nhead, 1, 1)
+        q = k = self.with_pos_embed(src, pos)
+        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]
+        # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src

python/utils/dependencies/XPose/models/UniPose/unipose.py ADDED Viewed

	@@ -0,0 +1,621 @@

+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# ------------------------------------------------------------------------
+import os
+import copy
+import torch
+import torch.nn.functional as F
+from torch import nn
+from typing import List
+from util.keypoint_ops import keypoint_xyzxyz_to_xyxyzz
+from util.misc import NestedTensor, nested_tensor_from_tensor_list,inverse_sigmoid
+from .utils import MLP
+from .backbone import build_backbone
+from ..registry import MODULE_BUILD_FUNCS
+from .mask_generate import prepare_for_mask, post_process
+from .deformable_transformer import build_deformable_transformer
+class UniPose(nn.Module):
+    """ This is the Cross-Attention Detector module that performs object detection """
+    def __init__(self, backbone, transformer, num_classes, num_queries,
+                 aux_loss=False, iter_update=False,
+                 query_dim=2,
+                 random_refpoints_xy=False,
+                 fix_refpoints_hw=-1,
+                 num_feature_levels=1,
+                 nheads=8,
+                 # two stage
+                 two_stage_type='no',  # ['no', 'standard']
+                 two_stage_add_query_num=0,
+                 dec_pred_class_embed_share=True,
+                 dec_pred_bbox_embed_share=True,
+                 two_stage_class_embed_share=True,
+                 two_stage_bbox_embed_share=True,
+                 decoder_sa_type='sa',
+                 num_patterns=0,
+                 dn_number=100,
+                 dn_box_noise_scale=0.4,
+                 dn_label_noise_ratio=0.5,
+                 dn_labelbook_size=100,
+                 use_label_enc=True,
+                 text_encoder_type='bert-base-uncased',
+                 binary_query_selection=False,
+                 use_cdn=True,
+                 sub_sentence_present=True,
+                 num_body_points=68,
+                 num_box_decoder_layers=2,
+                 ):
+        """ Initializes the model.
+        Parameters:
+            backbone: torch module of the backbone to be used. See backbone.py
+            transformer: torch module of the transformer architecture. See transformer.py
+            num_classes: number of object classes
+            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
+                         Conditional DETR can detect in a single image. For COCO, we recommend 100 queries.
+            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
+            iter_update: must be truthy; an assert below rejects anything else.
+            query_dim: must be 4 (asserted below) although the default is 2.
+            fix_refpoints_hw: -1(default): learn w and h for each box separately
+                                >0 : given fixed number
+                                -2 : learn a shared w and h
+            two_stage_type: 'no' or 'standard'; must be 'no' when num_feature_levels is 1.
+            use_label_enc: must be truthy; the False branch raises NotImplementedError.
+            decoder_sa_type: one of 'sa', 'ca_label', 'ca_content'.
+            num_body_points: stored and also set on the decoder; sizes the keypoint outputs.
+            num_box_decoder_layers: subtracted from transformer.num_decoder_layers to size the
+                                    pose head lists.
+        text_encoder_type is accepted but not read in this method.
+        """
+        super().__init__()
+        self.num_queries = num_queries
+        self.transformer = transformer
+        self.num_classes = num_classes
+        self.hidden_dim = hidden_dim = transformer.d_model
+        self.num_feature_levels = num_feature_levels
+        self.nheads = nheads
+        self.use_label_enc = use_label_enc
+        if use_label_enc:
+            self.label_enc = nn.Embedding(dn_labelbook_size + 1, hidden_dim)
+        else:
+            raise NotImplementedError
+            # Unreachable: the raise above always fires first.
+            self.label_enc = None
+        self.max_text_len = 256
+        self.binary_query_selection = binary_query_selection
+        self.sub_sentence_present = sub_sentence_present
+        # setting query dim
+        self.query_dim = query_dim
+        assert query_dim == 4
+        self.random_refpoints_xy = random_refpoints_xy
+        self.fix_refpoints_hw = fix_refpoints_hw
+        # for dn training
+        self.num_patterns = num_patterns
+        self.dn_number = dn_number
+        self.dn_box_noise_scale = dn_box_noise_scale
+        self.dn_label_noise_ratio = dn_label_noise_ratio
+        self.dn_labelbook_size = dn_labelbook_size
+        self.use_cdn = use_cdn
+        # 3-layer MLPs mapping 512-dimensional inputs to hidden_dim.
+        self.projection = MLP(512, hidden_dim, hidden_dim, 3)
+        self.projection_kpt = MLP(512, hidden_dim, hidden_dim, 3)
+        # device is only referenced by the commented-out CLIP loading code below.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        # model, _ = clip.load("ViT-B/32", device=device)
+        # self.clip_model = model
+        # visual_parameters = list(self.clip_model.visual.parameters())
+        # #
+        # for param in visual_parameters:
+        #     param.requires_grad = False
+        self.pos_proj = nn.Linear(hidden_dim, 768)
+        self.padding = nn.Embedding(1, 768)
+        # prepare input projection layers
+        if num_feature_levels > 1:
+            num_backbone_outs = len(backbone.num_channels)
+            input_proj_list = []
+            # One 1x1 conv + GroupNorm per backbone output level.
+            for _ in range(num_backbone_outs):
+                in_channels = backbone.num_channels[_]
+                input_proj_list.append(nn.Sequential(
+                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
+                    nn.GroupNorm(32, hidden_dim),
+                ))
+            # Extra levels use a stride-2 3x3 conv; the first one reuses in_channels from the
+            # last backbone level, later ones take hidden_dim channels.
+            for _ in range(num_feature_levels - num_backbone_outs):
+                input_proj_list.append(nn.Sequential(
+                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
+                    nn.GroupNorm(32, hidden_dim),
+                ))
+                in_channels = hidden_dim
+            self.input_proj = nn.ModuleList(input_proj_list)
+        else:
+            assert two_stage_type == 'no', "two_stage_type should be no if num_feature_levels=1 !!!"
+            # Single-level case: project only the last backbone output.
+            self.input_proj = nn.ModuleList([
+                nn.Sequential(
+                    nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1),
+                    nn.GroupNorm(32, hidden_dim),
+                )])
+        self.backbone = backbone
+        self.aux_loss = aux_loss
+        self.box_pred_damping = box_pred_damping = None
+        self.iter_update = iter_update
+        assert iter_update, "Why not iter_update?"
+        # prepare pred layers
+        self.dec_pred_class_embed_share = dec_pred_class_embed_share
+        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share
+        # prepare class & box embed
+        _class_embed = ContrastiveAssign()
+        _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
+        # Zero the last layer so the box head initially predicts a zero delta.
+        nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)
+        _pose_embed = MLP(hidden_dim, hidden_dim, 2, 3)
+        _pose_hw_embed = MLP(hidden_dim, hidden_dim, 2, 3)
+        # Only _pose_embed gets the zero initialisation; _pose_hw_embed keeps its default init.
+        nn.init.constant_(_pose_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(_pose_embed.layers[-1].bias.data, 0)
+        # With sharing enabled every list entry is the same module object; otherwise each is a deep copy.
+        if dec_pred_bbox_embed_share:
+            box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)]
+        else:
+            box_embed_layerlist = [copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers)]
+        if dec_pred_class_embed_share:
+            class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)]
+        else:
+            class_embed_layerlist = [copy.deepcopy(_class_embed) for i in range(transformer.num_decoder_layers)]
+        # The pose heads follow the bbox sharing flag; there is no separate pose flag.
+        if dec_pred_bbox_embed_share:
+            pose_embed_layerlist = [_pose_embed for i in
+                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]
+        else:
+            pose_embed_layerlist = [copy.deepcopy(_pose_embed) for i in
+                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]
+        # The hw head is always one shared instance, regardless of the sharing flags.
+        pose_hw_embed_layerlist = [_pose_hw_embed for i in
+                                   range(transformer.num_decoder_layers - num_box_decoder_layers)]
+        self.num_box_decoder_layers = num_box_decoder_layers
+        self.bbox_embed = nn.ModuleList(box_embed_layerlist)
+        self.class_embed = nn.ModuleList(class_embed_layerlist)
+        self.num_body_points = num_body_points
+        self.pose_embed = nn.ModuleList(pose_embed_layerlist)
+        self.pose_hw_embed = nn.ModuleList(pose_hw_embed_layerlist)
+        # Give the decoder references to the same head modules.
+        self.transformer.decoder.bbox_embed = self.bbox_embed
+        self.transformer.decoder.class_embed = self.class_embed
+        self.transformer.decoder.pose_embed = self.pose_embed
+        self.transformer.decoder.pose_hw_embed = self.pose_hw_embed
+        self.transformer.decoder.num_body_points = num_body_points
+        # two stage
+        self.two_stage_type = two_stage_type
+        self.two_stage_add_query_num = two_stage_add_query_num
+        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
+        if two_stage_type != 'no':
+            # Encoder-output heads either alias the decoder heads or get their own deep copy.
+            if two_stage_bbox_embed_share:
+                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
+                self.transformer.enc_out_bbox_embed = _bbox_embed
+            else:
+                self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)
+            if two_stage_class_embed_share:
+                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
+                self.transformer.enc_out_class_embed = _class_embed
+            else:
+                self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed)
+            self.refpoint_embed = None
+            if self.two_stage_add_query_num > 0:
+                self.init_ref_points(two_stage_add_query_num)
+        self.decoder_sa_type = decoder_sa_type
+        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
+        # self.replace_sa_with_double_ca = replace_sa_with_double_ca
+        # Every decoder layer gets a label_embedding attribute: one shared embedding for 'ca_label', None otherwise.
+        if decoder_sa_type == 'ca_label':
+            self.label_embedding = nn.Embedding(num_classes, hidden_dim)
+            for layer in self.transformer.decoder.layers:
+                layer.label_embedding = self.label_embedding
+        else:
+            for layer in self.transformer.decoder.layers:
+                layer.label_embedding = None
+            self.label_embedding = None
+        self._reset_parameters()
+    def open_set_transfer_init(self):
+        for name, param in self.named_parameters():
+            if 'fusion_layers' in name:
+                continue
+            if 'ca_text' in name:
+                continue
+            if 'catext_norm' in name:
+                continue
+            if 'catext_dropout' in name:
+                continue
+            if "text_layers" in name:
+                continue
+            if 'bert' in name:
+                continue
+            if 'bbox_embed' in name:
+                continue
+            if 'label_enc.weight' in name:
+                continue
+            if 'feat_map' in name:
+                continue
+            if 'enc_output' in name:
+                continue
+            param.requires_grad_(False)
+        # import ipdb; ipdb.set_trace()
+    def _reset_parameters(self):
+        # init input_proj
+        for proj in self.input_proj:
+            nn.init.xavier_uniform_(proj[0].weight, gain=1)
+            nn.init.constant_(proj[0].bias, 0)
+    def init_ref_points(self, use_num_queries):
+        self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)
+        if self.random_refpoints_xy:
+            # import ipdb; ipdb.set_trace()
+            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
+            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
+            self.refpoint_embed.weight.data[:, :2].requires_grad = False
+        if self.fix_refpoints_hw > 0:
+            print("fix_refpoints_hw: {}".format(self.fix_refpoints_hw))
+            assert self.random_refpoints_xy
+            self.refpoint_embed.weight.data[:, 2:] = self.fix_refpoints_hw
+            self.refpoint_embed.weight.data[:, 2:] = inverse_sigmoid(self.refpoint_embed.weight.data[:, 2:])
+            self.refpoint_embed.weight.data[:, 2:].requires_grad = False
+        elif int(self.fix_refpoints_hw) == -1:
+            pass
+        elif int(self.fix_refpoints_hw) == -2:
+            print('learn a shared h and w')
+            assert self.random_refpoints_xy
+            self.refpoint_embed = nn.Embedding(use_num_queries, 2)
+            self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
+            self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
+            self.refpoint_embed.weight.data[:, :2].requires_grad = False
+            self.hw_embed = nn.Embedding(1, 1)
+        else:
+            raise NotImplementedError('Unknown fix_refpoints_hw {}'.format(self.fix_refpoints_hw))
+    def forward(self, samples: NestedTensor, targets: List = None, **kw):
+        """ The forward expects a NestedTensor, which consists of:
+               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
+               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels
+            It returns a dict with the following elements:
+               - "pred_logits": the classification logits (including no-object) for all queries.
+                                Shape= [batch_size x num_queries x num_classes]
+               - "pred_boxes": The normalized boxes coordinates for all queries, represented as
+                               (center_x, center_y, width, height). These values are normalized in [0, 1],
+                               relative to the size of each individual image (disregarding possible padding).
+                               See PostProcess for information on how to retrieve the unnormalized bounding box.
+               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
+                                dictionnaries containing the two above keys for each decoder layer.
+        """
+        captions = [t['instance_text_prompt'] for t in targets]
+        bs=len(captions)
+        tensor_list = [tgt["object_embeddings_text"] for tgt in targets]
+        max_size = 350
+        padded_tensors = [torch.cat([tensor, torch.zeros(max_size - tensor.size(0), tensor.size(1),device=tensor.device)]) if tensor.size(0) < max_size else tensor for tensor in tensor_list]
+        object_embeddings_text = torch.stack(padded_tensors)
+        kpts_embeddings_text = torch.stack([tgt["kpts_embeddings_text"] for tgt in targets])[:, :self.num_body_points]
+        encoded_text=self.projection(object_embeddings_text) # bs, 81, 101, 256
+        kpt_embeddings_specific=self.projection_kpt(kpts_embeddings_text) # bs, 81, 101, 256
+        kpt_vis = torch.stack([tgt["kpt_vis_text"] for tgt in targets])[:, :self.num_body_points]
+        kpt_mask = torch.cat((torch.ones_like(kpt_vis, device=kpt_vis.device)[..., 0].unsqueeze(-1), kpt_vis), dim=-1)
+        num_classes = encoded_text.shape[1] # bs, 81, 101, 256
+        text_self_attention_masks = torch.eye(num_classes).unsqueeze(0).expand(bs, -1, -1).bool().to(samples.device)
+        text_token_mask = torch.zeros(samples.shape[0],num_classes).to(samples.device)>0
+        for i in range(bs):
+            text_token_mask[i,:len(captions[i])]=True
+        position_ids = torch.zeros(samples.shape[0], num_classes).to(samples.device)
+        for i in range(bs):
+            position_ids[i,:len(captions[i])]= 1
+        text_dict = {
+            'encoded_text': encoded_text, # bs, 195, d_model
+            'text_token_mask': text_token_mask, # bs, 195
+            'position_ids': position_ids, # bs, 195
+            'text_self_attention_masks': text_self_attention_masks # bs, 195,195
+        }
+        # import ipdb; ipdb.set_trace()
+        if isinstance(samples, (list, torch.Tensor)):
+            samples = nested_tensor_from_tensor_list(samples)
+        features, poss = self.backbone(samples)
+        if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+            import ipdb;
+            ipdb.set_trace()
+        srcs = []
+        masks = []
+        for l, feat in enumerate(features):
+            src, mask = feat.decompose()
+            srcs.append(self.input_proj[l](src))
+            masks.append(mask)
+            assert mask is not None
+        if self.num_feature_levels > len(srcs):
+            _len_srcs = len(srcs)
+            for l in range(_len_srcs, self.num_feature_levels):
+                if l == _len_srcs:
+                    src = self.input_proj[l](features[-1].tensors)
+                else:
+                    src = self.input_proj[l](srcs[-1])
+                m = samples.mask
+                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
+                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
+                srcs.append(src)
+                masks.append(mask)
+                poss.append(pos_l)
+        if self.label_enc is not None:
+            label_enc = self.label_enc
+        else:
+            raise NotImplementedError
+            label_enc = encoded_text
+        if self.dn_number > 0 or targets is not None:
+            input_query_label, input_query_bbox, attn_mask, attn_mask2, dn_meta = \
+                prepare_for_mask(kpt_mask=kpt_mask)
+        else:
+            assert targets is None
+            input_query_bbox = input_query_label = attn_mask = attn_mask2 = dn_meta = None
+        hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(srcs, masks, input_query_bbox, poss,
+                                                                                 input_query_label, attn_mask, attn_mask2,
+                                                                                 text_dict, dn_meta,targets,kpt_embeddings_specific)
+        # In case num object=0
+        if self.label_enc is not None:
+            hs[0] += self.label_enc.weight[0, 0] * 0.0
+        hs[0] += self.pos_proj.weight[0, 0] * 0.0
+        hs[0] += self.pos_proj.bias[0] * 0.0
+        hs[0] += self.padding.weight[0, 0] * 0.0
+        num_group = 50
+        effective_dn_number = dn_meta['pad_size'] if self.training else 0
+        outputs_coord_list = []
+        outputs_class = []
+        for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_cls_embed, layer_hs) in enumerate(
+                zip(reference[:-1], self.bbox_embed, self.class_embed, hs)):
+            if dec_lid < self.num_box_decoder_layers:
+                layer_delta_unsig = layer_bbox_embed(layer_hs)
+                layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig)
+                layer_outputs_unsig = layer_outputs_unsig.sigmoid()
+                layer_cls = layer_cls_embed(layer_hs, text_dict)
+                outputs_coord_list.append(layer_outputs_unsig)
+                outputs_class.append(layer_cls)
+            else:
+                layer_hs_bbox_dn = layer_hs[:, :effective_dn_number, :]
+                layer_hs_bbox_norm = layer_hs[:, effective_dn_number:, :][:, 0::(self.num_body_points + 1), :]
+                bs = layer_ref_sig.shape[0]
+                reference_before_sigmoid_bbox_dn = layer_ref_sig[:, :effective_dn_number, :]
+                reference_before_sigmoid_bbox_norm = layer_ref_sig[:, effective_dn_number:, :][:,
+                                                     0::(self.num_body_points + 1), :]
+                layer_delta_unsig_dn = layer_bbox_embed(layer_hs_bbox_dn)
+                layer_delta_unsig_norm = layer_bbox_embed(layer_hs_bbox_norm)
+                layer_outputs_unsig_dn = layer_delta_unsig_dn + inverse_sigmoid(reference_before_sigmoid_bbox_dn)
+                layer_outputs_unsig_dn = layer_outputs_unsig_dn.sigmoid()
+                layer_outputs_unsig_norm = layer_delta_unsig_norm + inverse_sigmoid(reference_before_sigmoid_bbox_norm)
+                layer_outputs_unsig_norm = layer_outputs_unsig_norm.sigmoid()
+                layer_outputs_unsig = torch.cat((layer_outputs_unsig_dn, layer_outputs_unsig_norm), dim=1)
+                layer_cls_dn = layer_cls_embed(layer_hs_bbox_dn, text_dict)
+                layer_cls_norm = layer_cls_embed(layer_hs_bbox_norm, text_dict)
+                layer_cls = torch.cat((layer_cls_dn, layer_cls_norm), dim=1)
+                outputs_class.append(layer_cls)
+                outputs_coord_list.append(layer_outputs_unsig)
+        # update keypoints
+        outputs_keypoints_list = []
+        outputs_keypoints_hw = []
+        kpt_index = [x for x in range(num_group * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
+        for dec_lid, (layer_ref_sig, layer_hs) in enumerate(zip(reference[:-1], hs)):
+            if dec_lid < self.num_box_decoder_layers:
+                assert isinstance(layer_hs, torch.Tensor)
+                bs = layer_hs.shape[0]
+                layer_res = layer_hs.new_zeros((bs, self.num_queries, self.num_body_points * 3))
+                outputs_keypoints_list.append(layer_res)
+            else:
+                bs = layer_ref_sig.shape[0]
+                layer_hs_kpt = layer_hs[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
+                                                                                                 device=layer_hs.device))
+                delta_xy_unsig = self.pose_embed[dec_lid - self.num_box_decoder_layers](layer_hs_kpt)
+                layer_ref_sig_kpt = layer_ref_sig[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
+                                                                                                           device=layer_hs.device))
+                layer_outputs_unsig_keypoints = delta_xy_unsig + inverse_sigmoid(layer_ref_sig_kpt[..., :2])
+                vis_xy_unsig = torch.ones_like(layer_outputs_unsig_keypoints,
+                                               device=layer_outputs_unsig_keypoints.device)
+                xyv = torch.cat((layer_outputs_unsig_keypoints, vis_xy_unsig[:, :, 0].unsqueeze(-1)), dim=-1)
+                xyv = xyv.sigmoid()
+                layer_res = xyv.reshape((bs, num_group, self.num_body_points, 3)).flatten(2, 3)
+                layer_hw = layer_ref_sig_kpt[..., 2:].reshape(bs, num_group, self.num_body_points, 2).flatten(2, 3)
+                layer_res = keypoint_xyzxyz_to_xyxyzz(layer_res)
+                outputs_keypoints_list.append(layer_res)
+                outputs_keypoints_hw.append(layer_hw)
+        if self.dn_number > 0 and dn_meta is not None:
+            outputs_class, outputs_coord_list = \
+                post_process(outputs_class, outputs_coord_list,
+                                dn_meta, self.aux_loss, self._set_aux_loss)
+        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord_list[-1],
+               'pred_keypoints': outputs_keypoints_list[-1]}
+        return out
+@MODULE_BUILD_FUNCS.registe_with_name(module_name='UniPose')
+def build_unipose(args):
+    num_classes = args.num_classes
+    device = torch.device(args.device)
+    backbone = build_backbone(args)
+    transformer = build_deformable_transformer(args)
+    try:
+        match_unstable_error = args.match_unstable_error
+        dn_labelbook_size = args.dn_labelbook_size
+    except:
+        match_unstable_error = True
+        dn_labelbook_size = num_classes
+    try:
+        dec_pred_class_embed_share = args.dec_pred_class_embed_share
+    except:
+        dec_pred_class_embed_share = True
+    try:
+        dec_pred_bbox_embed_share = args.dec_pred_bbox_embed_share
+    except:
+        dec_pred_bbox_embed_share = True
+    binary_query_selection = False
+    try:
+        binary_query_selection = args.binary_query_selection
+    except:
+        binary_query_selection = False
+    use_cdn = True
+    try:
+        use_cdn = args.use_cdn
+    except:
+        use_cdn = True
+    sub_sentence_present = True
+    try:
+        sub_sentence_present = args.sub_sentence_present
+    except:
+        sub_sentence_present = True
+    # print('********* sub_sentence_present', sub_sentence_present)
+    model = UniPose(
+        backbone,
+        transformer,
+        num_classes=num_classes,
+        num_queries=args.num_queries,
+        aux_loss=True,
+        iter_update=True,
+        query_dim=4,
+        random_refpoints_xy=args.random_refpoints_xy,
+        fix_refpoints_hw=args.fix_refpoints_hw,
+        num_feature_levels=args.num_feature_levels,
+        nheads=args.nheads,
+        dec_pred_class_embed_share=dec_pred_class_embed_share,
+        dec_pred_bbox_embed_share=dec_pred_bbox_embed_share,
+        # two stage
+        two_stage_type=args.two_stage_type,
+        # box_share
+        two_stage_bbox_embed_share=args.two_stage_bbox_embed_share,
+        two_stage_class_embed_share=args.two_stage_class_embed_share,
+        decoder_sa_type=args.decoder_sa_type,
+        num_patterns=args.num_patterns,
+        dn_number=args.dn_number if args.use_dn else 0,
+        dn_box_noise_scale=args.dn_box_noise_scale,
+        dn_label_noise_ratio=args.dn_label_noise_ratio,
+        dn_labelbook_size=dn_labelbook_size,
+        use_label_enc=args.use_label_enc,
+        text_encoder_type=args.text_encoder_type,
+        binary_query_selection=binary_query_selection,
+        use_cdn=use_cdn,
+        sub_sentence_present=sub_sentence_present
+    )
+    return model
+class ContrastiveAssign(nn.Module):
+    def __init__(self, project=False, cal_bias=None, max_text_len=256):
+        """
+        :param x: query
+        :param y: text embed
+        :param proj:
+        :return:
+        """
+        super().__init__()
+        self.project = project
+        self.cal_bias = cal_bias
+        self.max_text_len = max_text_len
+    def forward(self, x, text_dict):
+        """_summary_
+        Args:
+            x (_type_): _description_
+            text_dict (_type_): _description_
+            {
+                'encoded_text': encoded_text, # bs, 195, d_model
+                'text_token_mask': text_token_mask, # bs, 195
+                        # True for used tokens. False for padding tokens
+            }
+        Returns:
+            _type_: _description_
+        """
+        assert isinstance(text_dict, dict)
+        y = text_dict['encoded_text']
+        max_text_len = y.shape[1]
+        text_token_mask = text_dict['text_token_mask']
+        if self.cal_bias is not None:
+            raise NotImplementedError
+            return x @ y.transpose(-1, -2) + self.cal_bias.weight.repeat(x.shape[0], x.shape[1], 1)
+        res = x @ y.transpose(-1, -2)
+        res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))
+        # padding to max_text_len
+        new_res = torch.full((*res.shape[:-1], max_text_len), float('-inf'), device=res.device)
+        new_res[..., :res.shape[-1]] = res
+        return new_res

python/utils/dependencies/XPose/models/UniPose/utils.py ADDED Viewed

	@@ -0,0 +1,348 @@

+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+import copy
+import torch
+import random
+from torch import nn, Tensor
+import os
+import numpy as np
+import math
+import torch.nn.functional as F
+from torch import nn
+def _get_clones(module, N, layer_share=False):
+    # import ipdb; ipdb.set_trace()
+    if layer_share:
+        return nn.ModuleList([module for i in range(N)])
+    else:
+        return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+def get_sine_pos_embed(
+        pos_tensor: torch.Tensor,
+        num_pos_feats: int = 128,
+        temperature: int = 10000,
+        exchange_xy: bool = True,
+):
+    """generate sine position embedding from a position tensor
+    Args:
+        pos_tensor (torch.Tensor): shape: [..., n].
+        num_pos_feats (int): projected shape for each float in the tensor.
+        temperature (int): temperature in the sine/cosine function.
+        exchange_xy (bool, optional): exchange pos x and pos y. \
+            For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True.
+    Returns:
+        pos_embed (torch.Tensor): shape: [..., n*num_pos_feats].
+    """
+    scale = 2 * math.pi
+    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
+    def sine_func(x: torch.Tensor):
+        sin_x = x * scale / dim_t
+        sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2)
+        return sin_x
+    pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)]
+    if exchange_xy:
+        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
+    pos_res = torch.cat(pos_res, dim=-1)
+    return pos_res
+def gen_encoder_output_proposals(memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None):
+    """
+    Input:
+        - memory: bs, \sum{hw}, d_model
+        - memory_padding_mask: bs, \sum{hw}
+        - spatial_shapes: nlevel, 2
+        - learnedwh: 2
+    Output:
+        - output_memory: bs, \sum{hw}, d_model
+        - output_proposals: bs, \sum{hw}, 4
+    """
+    N_, S_, C_ = memory.shape
+    base_scale = 4.0
+    proposals = []
+    _cur = 0
+    for lvl, (H_, W_) in enumerate(spatial_shapes):
+        mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
+        valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+        valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+        # import ipdb; ipdb.set_trace()
+        grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
+                                        torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
+        grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)  # H_, W_, 2
+        scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
+        grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
+        if learnedwh is not None:
+            # import ipdb; ipdb.set_trace()
+            wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0 ** lvl)
+        else:
+            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
+        # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1)
+        # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
+        # wh = torch.ones_like(grid) / scale
+        proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
+        proposals.append(proposal)
+        _cur += (H_ * W_)
+    # import ipdb; ipdb.set_trace()
+    output_proposals = torch.cat(proposals, 1)
+    output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+    output_proposals = torch.log(output_proposals / (1 - output_proposals))  # unsigmoid
+    output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+    output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
+    output_memory = memory
+    output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
+    output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
+    # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
+    # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf'))
+    return output_memory, output_proposals
+class RandomBoxPerturber():
+    def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None:
+        self.noise_scale = torch.Tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale])
+    def __call__(self, refanchors: Tensor) -> Tensor:
+        nq, bs, query_dim = refanchors.shape
+        device = refanchors.device
+        noise_raw = torch.rand_like(refanchors)
+        noise_scale = self.noise_scale.to(device)[:query_dim]
+        new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale)
+        return new_refanchors.clamp_(0, 1)
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False):
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs
+                (0 for the negative class and 1 for the positive class).
+        alpha: (optional) Weighting factor in range (0,1) to balance
+                positive vs negative examples. Default = -1 (no weighting).
+        gamma: Exponent of the modulating factor (1 - p_t) to
+               balance easy vs hard examples.
+    Returns:
+        Loss tensor
+    """
+    prob = inputs.sigmoid()
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+    if no_reduction:
+        return loss
+    return loss.mean(1).sum() / num_boxes
+class MLP(nn.Module):
+    """ Very simple multi-layer perceptron (also called FFN)"""
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+def _get_activation_fn(activation, d_model=256, batch_dim=0):
+    """Return an activation function given a string"""
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    if activation == "prelu":
+        return nn.PReLU()
+    if activation == "selu":
+        return F.selu
+    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
+def gen_sineembed_for_position(pos_tensor):
+    # n_query, bs, _ = pos_tensor.size()
+    # sineembed_tensor = torch.zeros(n_query, bs, 256)
+    scale = 2 * math.pi
+    dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = 10000 ** (2 * (dim_t // 2) / 128)
+    x_embed = pos_tensor[:, :, 0] * scale
+    y_embed = pos_tensor[:, :, 1] * scale
+    pos_x = x_embed[:, :, None] / dim_t
+    pos_y = y_embed[:, :, None] / dim_t
+    pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
+    pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
+    if pos_tensor.size(-1) == 2:
+        pos = torch.cat((pos_y, pos_x), dim=2)
+    elif pos_tensor.size(-1) == 4:
+        w_embed = pos_tensor[:, :, 2] * scale
+        pos_w = w_embed[:, :, None] / dim_t
+        pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2)
+        h_embed = pos_tensor[:, :, 3] * scale
+        pos_h = h_embed[:, :, None] / dim_t
+        pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2)
+        pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
+    else:
+        raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
+    return pos
+def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):
+    sigmas = kpt_preds.new_tensor(sigmas)
+    variances = (sigmas * 2) ** 2
+    assert kpt_preds.size(0) == kpt_gts.size(0)
+    kpt_preds = kpt_preds.reshape(-1, kpt_preds.size(-1) // 2, 2)
+    kpt_gts = kpt_gts.reshape(-1, kpt_gts.size(-1) // 2, 2)
+    squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \
+                       (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2
+    # import pdb
+    # pdb.set_trace()
+    # assert (kpt_valids.sum(-1) > 0).all()
+    squared_distance0 = squared_distance / (kpt_areas[:, None] * variances[None, :] * 2)
+    squared_distance1 = torch.exp(-squared_distance0)
+    squared_distance1 = squared_distance1 * kpt_valids
+    oks = squared_distance1.sum(dim=1) / (kpt_valids.sum(dim=1) + 1e-6)
+    return oks
+def oks_loss(pred,
+             target,
+             valid=None,
+             area=None,
+             linear=False,
+             sigmas=None,
+             eps=1e-6):
+    """Oks loss.
+    Computing the oks loss between a set of predicted poses and target poses.
+    The loss is calculated as negative log of oks.
+    Args:
+        pred (torch.Tensor): Predicted poses of format (x1, y1, x2, y2, ...),
+            shape (n, 2K).
+        target (torch.Tensor): Corresponding gt poses, shape (n, 2K).
+        linear (bool, optional): If True, use linear scale of loss instead of
+            log scale. Default: False.
+        eps (float): Eps to avoid log(0).
+    Return:
+        torch.Tensor: Loss tensor.
+    """
+    oks = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps)
+    if linear:
+        loss = 1 - oks
+    else:
+        loss = -oks.log()
+    return loss
+class OKSLoss(nn.Module):
+    """IoULoss.
+    Computing the oks loss between a set of predicted poses and target poses.
+    Args:
+        linear (bool): If True, use linear scale of loss instead of log scale.
+            Default: False.
+        eps (float): Eps to avoid log(0).
+        reduction (str): Options are "none", "mean" and "sum".
+        loss_weight (float): Weight of loss.
+    """
+    def __init__(self,
+                 linear=False,
+                 num_keypoints=17,
+                 eps=1e-6,
+                 reduction='mean',
+                 loss_weight=1.0):
+        super(OKSLoss, self).__init__()
+        self.linear = linear
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        if num_keypoints == 68:
+            self.sigmas = np.array([
+                .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
+                1.07, .87, .87, .89, .89, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+                .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25, .25,
+            ], dtype=np.float32) / 10.0
+        else:
+            raise ValueError(f'Unsupported keypoints number {num_keypoints}')
+    def forward(self,
+                pred,
+                target,
+                valid,
+                area,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            valid (torch.Tensor): The visible flag of the target pose.
+            area (torch.Tensor): The area of the target pose.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None. Options are "none", "mean" and "sum".
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if (weight is not None) and (not torch.any(weight > 0)) and (
+                reduction != 'none'):
+            if pred.dim() == weight.dim() + 1:
+                weight = weight.unsqueeze(1)
+            return (pred * weight).sum()  # 0
+        if weight is not None and weight.dim() > 1:
+            # TODO: remove this in the future
+            # reduce the weight of shape (n, 4) to (n,) to match the
+            # iou_loss of shape (n,)
+            assert weight.shape == pred.shape
+            weight = weight.mean(-1)
+        loss = self.loss_weight * oks_loss(
+            pred,
+            target,
+            valid=valid,
+            area=area,
+            linear=self.linear,
+            sigmas=self.sigmas,
+            eps=self.eps)
+        return loss

python/utils/dependencies/XPose/models/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# ------------------------------------------------------------------------
+# ED-Pose
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .UniPose.unipose import build_unipose
+def build_model(args):
+    # we use register to maintain models from catdet6 on.
+    from .registry import MODULE_BUILD_FUNCS
+    assert args.modelname in MODULE_BUILD_FUNCS._module_dict
+    build_func = MODULE_BUILD_FUNCS.get(args.modelname)
+    model = build_func(args)
+    return model

python/utils/dependencies/XPose/models/registry.py ADDED Viewed

	@@ -0,0 +1,58 @@

+# -*- coding: utf-8 -*-
+# @Author: Yihao Chen
+# @Date:   2021-08-16 16:03:17
+# @Last Modified by:   Shilong Liu
+# @Last Modified time: 2022-01-23 15:26
+# modified from mmcv
+import inspect
+from functools import partial
+class Registry(object):
+    def __init__(self, name):
+        self._name = name
+        self._module_dict = dict()
+    def __repr__(self):
+        format_str = self.__class__.__name__ + '(name={}, items={})'.format(
+            self._name, list(self._module_dict.keys()))
+        return format_str
+    def __len__(self):
+        return len(self._module_dict)
+    @property
+    def name(self):
+        return self._name
+    @property
+    def module_dict(self):
+        return self._module_dict
+    def get(self, key):
+        return self._module_dict.get(key, None)
+    def registe_with_name(self, module_name=None, force=False):
+        return partial(self.register, module_name=module_name, force=force)
+    def register(self, module_build_function, module_name=None, force=False):
+        """Register a module build function.
+        Args:
+            module (:obj:`nn.Module`): Module to be registered.
+        """
+        if not inspect.isfunction(module_build_function):
+            raise TypeError('module_build_function must be a function, but got {}'.format(
+                type(module_build_function)))
+        if module_name is None:
+            module_name = module_build_function.__name__
+        if not force and module_name in self._module_dict:
+            raise KeyError('{} is already registered in {}'.format(
+                module_name, self.name))
+        self._module_dict[module_name] = module_build_function
+        return module_build_function
+MODULE_BUILD_FUNCS = Registry('model build functions')

python/utils/dependencies/XPose/predefined_keypoints.py ADDED Viewed

	@@ -0,0 +1,56 @@

+person = {"keypoints":['nose', 'left eye', 'right eye', 'left ear', 'right ear', 'left shoulder', 'right shoulder', 'left elbow', 'right elbow', 'left wrist', 'right wrist', 'left hip', 'right hip', 'left knee', 'right knee', 'left ankle', 'right ankle'],"skeleton": [[16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13],[6,7],[6,8],[7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]]}
+face = {"keypoints": ['right cheekbone 1', 'right cheekbone 2', 'right cheek 1', 'right cheek 2', 'right cheek 3', 'right cheek 4', 'right cheek 5', 'right chin', 'chin center', 'left chin', 'left cheek 5', 'left cheek 4', 'left cheek 3', 'left cheek 2', 'left cheek 1', 'left cheekbone 2', 'left cheekbone 1', 'right eyebrow 1', 'right eyebrow 2', 'right eyebrow 3', 'right eyebrow 4', 'right eyebrow 5', 'left eyebrow 1', 'left eyebrow 2', 'left eyebrow 3', 'left eyebrow 4', 'left eyebrow 5', 'nasal bridge 1', 'nasal bridge 2', 'nasal bridge 3', 'nasal bridge 4', 'right nasal wing 1', 'right nasal wing 2', 'nasal wing center', 'left nasal wing 1', 'left nasal wing 2', 'right eye eye corner 1', 'right eye upper eyelid 1', 'right eye upper eyelid 2', 'right eye eye corner 2', 'right eye lower eyelid 2', 'right eye lower eyelid 1', 'left eye eye corner 1', 'left eye upper eyelid 1', 'left eye upper eyelid 2', 'left eye eye corner 2', 'left eye lower eyelid 2', 'left eye lower eyelid 1', 'right mouth corner', 'upper lip outer edge 1', 'upper lip outer edge 2', 'upper lip outer edge 3', 'upper lip outer edge 4', 'upper lip outer edge 5', 'left mouth corner', 'lower lip outer edge 5', 'lower lip outer edge 4', 'lower lip outer edge 3', 'lower lip outer edge 2', 'lower lip outer edge 1', 'upper lip inter edge 1', 'upper lip inter edge 2', 'upper lip inter edge 3', 'upper lip inter edge 4', 'upper lip inter edge 5', 'lower lip inter edge 3', 'lower lip inter edge 2', 'lower lip inter edge 1'], "skeleton": []}
+hand = {"keypoints":['wrist', 'thumb root', "thumb's third knuckle", "thumb's second knuckle", 'thumb’s first knuckle', "forefinger's root", "forefinger's third knuckle", "forefinger's second knuckle", "forefinger's first knuckle", "middle finger's root", "middle finger's third knuckle", "middle finger's second knuckle", "middle finger's first knuckle", "ring finger's root", "ring finger's third knuckle", "ring finger's second knuckle", "ring finger's first knuckle", "pinky finger's root", "pinky finger's third knuckle", "pinky finger's second knuckle", "pinky finger's first knuckle"],"skeleton": []}
+animal_in_AnimalKindom = {"keypoints":['head mid top', 'eye left', 'eye right', 'mouth front top', 'mouth back left', 'mouth back right', 'mouth front bottom', 'shoulder left', 'shoulder right', 'elbow left', 'elbow right', 'wrist left', 'wrist right', 'torso mid back', 'hip left', 'hip right', 'knee left', 'knee right', 'ankle left ', 'ankle right', 'tail top back', 'tail mid back', 'tail end back'],"skeleton": [[1, 0], [2, 0], [3, 4], [3, 5], [4, 6], [5, 6], [0, 7], [0, 8], [7, 9], [8, 10], [9, 11], [10, 12], [0, 13], [13, 20], [20, 14], [20, 15], [14, 16], [15, 17], [16, 18], [17, 19], [20, 21], [21, 22]]}
+animal_in_AP10K = {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
+animal= {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
+animal_face = {"keypoints": ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip'], "skeleton": []}
+fly = {"keypoints": ['head', 'eye left', 'eye right', 'neck', 'thorax', 'abdomen', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'wing left', 'wing right'], "skeleton": [[2, 1], [3, 1], [4, 1], [5, 4], [6, 5], [8, 7], [9, 8], [10, 9], [12, 11], [13, 12], [14, 13], [16, 15], [17, 16], [18, 17], [20, 19], [21, 20], [22, 21], [24, 23], [25, 24], [26, 25], [28, 27], [29, 28], [30, 29], [31, 4], [32, 4]]}
+locust = {"keypoints": ['head', 'neck', 'thorax', 'abdomen1', 'abdomen2', 'anttip left', 'antbase left', 'eye left', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'anttip right', 'antbase right', 'eye right', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip'],"skeleton": [[2, 1], [3, 2], [4, 3], [5, 4], [7, 6], [8, 7], [10, 9], [11, 10], [12, 11], [14, 13], [15, 14],[16, 15], [18, 17], [19, 18], [20, 19], [22, 21], [23, 22], [25, 24], [26, 25], [27, 26],[29, 28], [30, 29], [31, 30], [33, 32], [34, 33], [35, 34]]}
+car ={"keypoints": ['right front wheel center', 'left front wheel center', 'right rear wheel center', 'left rear wheel center', 'front right', 'front left', 'back right', 'back left', 'none', 'roof front right', 'roof front left', 'roof back right', 'roof back left', 'none'],"skeleton": [[0, 2], [1, 3], [0, 1], [2, 3], [9, 11], [10, 12], [9, 10], [11, 12], [4, 0], [4, 9], [4, 5], [5, 1], [5, 10], [6, 2], [6, 11], [7, 3], [7, 12], [6, 7]]}
+# Clothing keypoint vocabularies. Each name below is bound to a dict holding the
+# list of keypoint names for one garment category plus an empty 'skeleton' list
+# (no keypoint connections are defined for these categories).
+short_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+long_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'right sleeve inside 3', 'right sleeve inside 4', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 4', 'left sleeve inside 3', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
+short_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
+sling={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
+vest = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
+long_sleeved_dress={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'center hem', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+long_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+trousers = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right side outside 3', 'right cuff outside', 'right cuff inside', 'right side inside 1', 'crotch', 'left side inside 1', 'left cuff inside', 'left cuff outside', 'left side outside 3', 'left side outside 2'], 'skeleton': []}
+sling_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
+vest_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
+skirt = {'keypoints': ['right side 1', 'upper center', 'left side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2'], 'skeleton': []}
+# NOTE(review): unlike long_sleeved_dress, the side points of short_sleeved_dress run
+# 'left side 1..5' -> 'center hem' -> 'right side 5..1'. Confirm this ordering against
+# the dataset's annotation spec before relying on left/right semantics.
+short_sleeved_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'left side 1', 'left side 2', 'left side 3', 'left side 4', 'left side 5', 'center hem', 'right side 5', 'right side 4', 'right side 3', 'right side 2', 'right side 1', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
+shorts = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right cuff outside', 'right cuff inside', 'crotch', 'left cuff inside', 'left cuff outside', 'left side outside 2'], 'skeleton': []}
+# Furniture keypoint vocabularies, in the same layout as the dicts above: a list of
+# keypoint names per category and an empty 'skeleton' list.
+table = {'keypoints': ['desktop corner 1', 'desktop corner 2', 'desktop corner 3', 'desktop corner 4', 'table leg 1', 'table leg 2', 'table leg 3', 'table leg 4'], 'skeleton': []}
+chair = {'keypoints': ['legs righttopcorner', 'legs lefttopcorner', 'legs leftbottomcorner', 'legs rightbottomcorner', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'headboard righttop', 'headboard lefttop'], 'skeleton': []}
+bed = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'backrest lefttop'], 'skeleton': []}
+sofa = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'armrests rightbottomcorner', 'armrests righttopcorner', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'armrests leftbottomcorner', 'armrests lefttopcorner', 'backrest lefttop'], 'skeleton': []}
+swivelchair = {'keypoints': ['rotatingbase 1', 'rotatingbase 2', 'rotatingbase 3', 'rotatingbase 4', 'rotatingbase 5', 'rotatingbase center', 'base center', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'backrest righttop', 'backrest lefttop'], 'skeleton': []}

python/utils/dependencies/XPose/transforms.py ADDED Viewed

	@@ -0,0 +1,394 @@

+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Transforms and data augmentation for both image + bbox.
+"""
+import os
+import sys
+import random
+import PIL
+import torch
+import torchvision.transforms as T
+import torchvision.transforms.functional as F
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from util.box_ops import box_xyxy_to_cxcywh
+from util.misc import interpolate
+def crop(image, target, region):
+    """Crop ``image`` to ``region`` and keep the annotations in ``target`` consistent.
+
+    Args:
+        image: Image accepted by ``torchvision.transforms.functional.crop``.
+        target: Annotation dict or ``None``. When given it must contain
+            ``"id2catname"`` and ``"caption_list"``. ``"boxes"`` (rows read as
+            ``[x0, y0, x1, y1]``), ``"masks"`` and ``"keypoints"`` (last
+            dimension of size 3: x, y and a third value that is carried over
+            unchanged) are optional.
+        region: ``(top, left, height, width)`` of the crop window.
+
+    Returns:
+        ``(cropped_image, target)``. ``target`` is a shallow copy of the input
+        with boxes/keypoints shifted and clamped to the window, masks sliced,
+        ``"size"`` set to ``[height, width]`` and instances whose box (or, when
+        there are no boxes, mask) became empty removed.
+    """
+    cropped_image = F.crop(image, *region)
+    if target is None:
+        return cropped_image, target
+    target = target.copy()
+    i, j, h, w = region
+    id2catname = target["id2catname"]
+    caption_list = target["caption_list"]
+    target["size"] = torch.tensor([h, w])
+    # Per-instance fields that must be filtered together with boxes/masks.
+    fields = ["labels", "area", "iscrowd", "positive_map", "keypoints"]
+    max_size = torch.as_tensor([w, h], dtype=torch.float32)
+    if "boxes" in target:
+        boxes = target["boxes"]
+        # Shift into the crop's coordinate frame, then clamp to [0, (w, h)].
+        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
+        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
+        cropped_boxes = cropped_boxes.clamp(min=0)
+        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
+        target["boxes"] = cropped_boxes.reshape(-1, 4)
+        target["area"] = area
+        fields.append("boxes")
+    if "masks" in target:
+        # FIXME should we update the area here if there are no boxes?
+        target['masks'] = target['masks'][:, i:i + h, j:j + w]
+        fields.append("masks")
+    # remove elements for which the boxes or masks that have zero area.
+    # ``keep`` stays None when the target has neither boxes nor masks, so the
+    # debug branch below no longer hits an undefined name in that case.
+    keep = None
+    if "boxes" in target:
+        # favor boxes selection when defining which elements to keep
+        # this is compatible with previous implementation
+        corners = target['boxes'].reshape(-1, 2, 2)
+        keep = torch.all(corners[:, 1, :] > corners[:, 0, :], dim=1)
+    elif "masks" in target:
+        keep = target['masks'].flatten(1).any(1)
+    if keep is not None:
+        for field in fields:
+            if field in target:
+                target[field] = target[field][keep]
+    if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+        # for debug and visualization only.
+        if keep is not None and 'strings_positive' in target:
+            target['strings_positive'] = [_i for _i, _j in zip(target['strings_positive'], keep) if _j]
+    if "keypoints" in target:
+        keypoints = target["keypoints"]
+        flat = keypoints.view(-1, 3)
+        # Shift and clamp x/y only; the third column is passed through untouched.
+        cropped_xy = flat[:, :2] - torch.as_tensor([j, i])
+        cropped_xy = torch.min(cropped_xy, max_size).clamp(min=0)
+        cropped_keypoints = torch.cat([cropped_xy, flat[:, 2].unsqueeze(1)], dim=1)
+        target["keypoints"] = cropped_keypoints.view(keypoints.shape[0], keypoints.shape[1], 3)
+    target["id2catname"] = id2catname
+    target["caption_list"] = caption_list
+    return cropped_image, target
+# Keypoint index pairs that trade places under a horizontal flip, per dataset.
+# An empty list means the keypoints are mirrored in x but never re-indexed.
+_HFLIP_PAIRS = {
+    "coco_person": [[1, 2], [3, 4], [5, 6], [7, 8],
+                    [9, 10], [11, 12], [13, 14], [15, 16]],
+    "macaque": [[1, 2], [3, 4], [5, 6], [7, 8],
+                [9, 10], [11, 12], [13, 14], [15, 16]],
+    "animalkindom_ak_P1_animal": [[1, 2], [4, 5], [7, 8], [9, 10], [11, 12], [14, 15], [16, 17], [18, 19]],
+    "animalweb_animal": [[0, 3], [1, 2], [5, 6]],
+    "face": [
+        [0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], [6, 10], [7, 9],
+        [17, 26], [18, 25], [19, 24], [20, 23], [21, 22],
+        [31, 35], [32, 34],
+        [36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
+        [48, 54], [49, 53], [50, 52],
+        [55, 59], [56, 58],
+        [60, 64], [61, 63],
+        [65, 67]
+    ],
+    "hand": [],
+    "foot": [],
+    "locust": [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25], [11, 26], [12, 27], [13, 28], [14, 29], [15, 30], [16, 31], [17, 32], [18, 33], [19, 34]],
+    "fly": [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22], [11, 23], [12, 24], [13, 25], [14, 26], [15, 27], [16, 28], [17, 29], [30, 31]],
+    "ap_36k_animal": [[0, 1], [5, 8], [6, 9], [7, 10], [11, 14], [12, 15], [13, 16]],
+    "ap_10k_animal": [[0, 1], [5, 8], [6, 9], [7, 10], [11, 14], [12, 15], [13, 16]],
+}
+def hflip(image, target):
+    """Mirror ``image`` horizontally and update boxes, masks and keypoints.
+
+    Args:
+        image: Image exposing ``.size`` as ``(width, height)`` and accepted by
+            ``torchvision.transforms.functional.hflip``.
+        target: Annotation dict or ``None``. If it contains ``"keypoints"`` it
+            must also contain ``"dataset_name"``, which selects the keypoint
+            index pairs to swap.
+
+    Returns:
+        ``(flipped_image, target)`` where ``target`` is a shallow copy whose
+        ``"boxes"``, ``"masks"`` and ``"keypoints"`` entries are new tensors;
+        the tensors of the input dict are left unmodified.
+
+    Raises:
+        ValueError: if keypoints are present and ``dataset_name`` has no
+            registered flip pairs (previously an unbound-variable error).
+    """
+    flipped_image = F.hflip(image)
+    w, _ = image.size
+    if target is None:
+        return flipped_image, target
+    target = target.copy()
+    if "boxes" in target:
+        boxes = target["boxes"]
+        # Swap x0/x1 and mirror them around the image width.
+        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
+        target["boxes"] = boxes
+    if "masks" in target:
+        target['masks'] = target['masks'].flip(-1)
+    if "keypoints" in target:
+        dataset_name = target["dataset_name"]
+        if dataset_name not in _HFLIP_PAIRS:
+            raise ValueError(
+                "hflip: no keypoint flip pairs registered for dataset_name={!r}".format(dataset_name))
+        # Clone first: target.copy() is shallow, so writing into the original
+        # tensor would silently alter the caller's annotations.
+        keypoints = target["keypoints"].clone()
+        # Keypoints use the pixel-index convention (w - x - 1), unlike boxes above.
+        keypoints[:, :, 0] = w - keypoints[:, :, 0] - 1
+        for first, second in _HFLIP_PAIRS[dataset_name]:
+            # Index-list reads produce a copy, so the swap cannot alias itself.
+            keypoints[:, [first, second], :] = keypoints[:, [second, first], :]
+        target["keypoints"] = keypoints
+    return flipped_image, target
+def resize(image, target, size, max_size=None):
+    """Resize ``image`` and rescale the annotations in ``target`` accordingly.
+
+    ``size`` is either a scalar (length of the shorter side, aspect ratio kept,
+    optionally limited by ``max_size`` on the longer side) or a ``(w, h)``
+    list/tuple giving the exact output size. Returns ``(image, target)``;
+    ``target`` is ``None`` when no target was given, otherwise a shallow copy
+    with ``boxes``, ``area``, ``keypoints``, ``masks`` and ``size`` updated.
+    """
+    def _output_hw(image_size):
+        # An explicit (w, h) request is simply flipped into (h, w).
+        if isinstance(size, (list, tuple)):
+            return size[::-1]
+        short_side = size
+        w, h = image_size
+        if max_size is not None:
+            shorter = float(min((w, h)))
+            longer = float(max((w, h)))
+            # Reduce the requested short side if the long side would overshoot max_size.
+            if longer / shorter * short_side > max_size:
+                short_side = int(round(max_size * shorter / longer))
+        already_sized = (w <= h and w == short_side) or (h <= w and h == short_side)
+        if already_sized:
+            return (h, w)
+        if w < h:
+            return (int(short_side * h / w), short_side)
+        return (short_side, int(short_side * w / h))
+    out_size = _output_hw(image.size)
+    rescaled_image = F.resize(image, out_size)
+    if target is None:
+        return rescaled_image, None
+    # Scale factors are measured on the actual result, not on the request.
+    ratio_width, ratio_height = (
+        float(new) / float(old) for new, old in zip(rescaled_image.size, image.size)
+    )
+    target = target.copy()
+    if "boxes" in target:
+        box_scale = torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
+        target["boxes"] = target["boxes"] * box_scale
+    if "area" in target:
+        target["area"] = target["area"] * (ratio_width * ratio_height)
+    if "keypoints" in target:
+        # The third keypoint component is multiplied by 1, i.e. left as is.
+        kpt_scale = torch.as_tensor([ratio_width, ratio_height, 1])
+        target["keypoints"] = target["keypoints"] * kpt_scale
+    h, w = out_size
+    target["size"] = torch.tensor([h, w])
+    if "masks" in target:
+        resized_masks = interpolate(target['masks'][:, None].float(), out_size, mode="nearest")
+        target['masks'] = resized_masks[:, 0] > 0.5
+    return rescaled_image, target
+def pad(image, target, padding):
+    """Pad ``image`` on the right by ``padding[0]`` and at the bottom by ``padding[1]``.
+
+    Only the right/bottom edges grow, so coordinates are not shifted; in the
+    target only ``"size"`` and ``"masks"`` are updated. Returns
+    ``(padded_image, target)`` with ``target`` a shallow copy, or ``None`` if
+    no target was given.
+    """
+    # assumes that we only pad on the bottom right corners
+    right, bottom = padding[0], padding[1]
+    padded_image = F.pad(image, (0, 0, right, bottom))
+    if target is None:
+        return padded_image, None
+    target = target.copy()
+    # should we do something wrt the original size?
+    width_height = padded_image.size
+    target["size"] = torch.tensor(width_height[::-1])
+    if "masks" in target:
+        mask_padding = (0, right, 0, bottom)
+        target['masks'] = torch.nn.functional.pad(target['masks'], mask_padding)
+    return padded_image, target
+class ResizeDebug:
+    """Transform that always resizes a sample to the fixed ``size`` it was built with."""
+    def __init__(self, size):
+        self.size = size
+    def __call__(self, img, target):
+        fixed_size = self.size
+        return resize(img, target, fixed_size)
+class RandomCrop:
+    """Crop a randomly placed window of the configured ``size`` from a sample."""
+    def __init__(self, size):
+        self.size = size
+    def __call__(self, img, target):
+        # torchvision picks the window; crop() applies it to image and target.
+        window = T.RandomCrop.get_params(img, self.size)
+        return crop(img, target, window)
+class RandomSizeCrop(object):
+    """Crop a random window whose sides lie between ``min_size`` and ``max_size``.
+
+    Each side is drawn independently and is additionally capped by the image
+    extent. With ``respect_boxes=True`` the crop is re-drawn (up to 10 times)
+    until no box is filtered out; the last attempt is returned regardless.
+    """
+    def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):
+        # respect_boxes:    True to keep all boxes
+        #                   False to tolerate boxes being filtered out
+        self.min_size = min_size
+        self.max_size = max_size
+        self.respect_boxes = respect_boxes
+    def __call__(self, img: PIL.Image.Image, target: dict):
+        def sample():
+            # Draw one random window and apply it to image and target.
+            w = random.randint(self.min_size, min(img.width, self.max_size))
+            h = random.randint(self.min_size, min(img.height, self.max_size))
+            region = T.RandomCrop.get_params(img, [h, w])
+            return crop(img, target, region)
+        def box_count(tgt):
+            # A target without "boxes" counts as zero boxes instead of raising KeyError.
+            return len(tgt["boxes"]) if "boxes" in tgt else 0
+        # Without a target (or without the box constraint) there is nothing to
+        # retry for, so a single draw suffices. NOTE: for target=None the
+        # previous code drew and discarded nine extra crops before returning.
+        if target is None or not self.respect_boxes:
+            return sample()
+        init_boxes = box_count(target)
+        max_patience = 10
+        for _ in range(max_patience):
+            result_img, result_target = sample()
+            if box_count(result_target) == init_boxes:
+                break
+        return result_img, result_target
+class CenterCrop:
+    """Crop a ``(height, width)`` window centred on the image."""
+    def __init__(self, size):
+        self.size = size
+    def __call__(self, img, target):
+        image_width, image_height = img.size
+        crop_height, crop_width = self.size
+        # Split the surplus evenly between the two sides, rounding to a pixel.
+        top = int(round((image_height - crop_height) / 2.))
+        left = int(round((image_width - crop_width) / 2.))
+        region = (top, left, crop_height, crop_width)
+        return crop(img, target, region)
+class RandomHorizontalFlip:
+    """Horizontally flip a sample with probability ``p``; otherwise pass it through."""
+    def __init__(self, p=0.5):
+        self.p = p
+    def __call__(self, img, target):
+        should_flip = random.random() < self.p
+        if not should_flip:
+            return img, target
+        return hflip(img, target)
+class RandomResize:
+    """Resize a sample to a size picked uniformly at random from ``sizes``."""
+    def __init__(self, sizes, max_size=None):
+        assert isinstance(sizes, (list, tuple))
+        self.max_size = max_size
+        self.sizes = sizes
+    def __call__(self, img, target=None):
+        chosen = random.choice(self.sizes)
+        return resize(img, target, chosen, self.max_size)
+class RandomPad:
+    """Pad a sample on the right/bottom by random amounts in ``[0, max_pad]``."""
+    def __init__(self, max_pad):
+        self.max_pad = max_pad
+    def __call__(self, img, target):
+        # Horizontal amount is drawn first, then vertical (order fixes the RNG stream).
+        pad_x = random.randint(0, self.max_pad)
+        pad_y = random.randint(0, self.max_pad)
+        amounts = (pad_x, pad_y)
+        return pad(img, target, amounts)
+class RandomSelect:
+    """
+    Apply exactly one of two transforms, chosen at random:
+    transforms1 with probability p, transforms2 with probability (1 - p).
+    """
+    def __init__(self, transforms1, transforms2, p=0.5):
+        self.p = p
+        self.transforms1 = transforms1
+        self.transforms2 = transforms2
+    def __call__(self, img, target):
+        use_first = random.random() < self.p
+        chosen = self.transforms1 if use_first else self.transforms2
+        return chosen(img, target)
+class ToTensor:
+    """Convert the image with ``F.to_tensor``; the target is passed through unchanged."""
+    def __call__(self, img, target):
+        tensor_img = F.to_tensor(img)
+        return tensor_img, target
+class RandomErasing:
+    """Pair-aware wrapper around ``T.RandomErasing``: erases in the image only."""
+    def __init__(self, *args, **kwargs):
+        # All arguments are forwarded verbatim to torchvision's transform.
+        self.eraser = T.RandomErasing(*args, **kwargs)
+    def __call__(self, img, target):
+        erased = self.eraser(img)
+        return erased, target
+class Normalize(object):
+    """Normalize the image tensor and convert annotations to relative coordinates.
+
+    On a target dict (shallow-copied) this:
+      * converts ``"boxes"`` with ``box_xyxy_to_cxcywh`` and divides by
+        ``[w, h, w, h]`` of the image tensor;
+      * divides ``"area"`` by ``w * h``;
+      * rewrites ``"keypoints"`` (indexed as ``[instance, keypoint, 3]``) into
+        one row per instance: ``2 * MAX_KEYPOINTS`` interleaved x/y values
+        divided by ``w``/``h`` and zero-padded, followed by ``MAX_KEYPOINTS``
+        zero-padded flags in which the value 2 has been mapped to 1. The
+        unpadded keypoint count is stored in ``"valid_kpt_num"``.
+    """
+    # Number of keypoint slots every instance row is padded to.
+    MAX_KEYPOINTS = 68
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+    def __call__(self, image, target=None):
+        image = F.normalize(image, mean=self.mean, std=self.std)
+        if target is None:
+            return image, None
+        target = target.copy()
+        h, w = image.shape[-2:]
+        if "boxes" in target:
+            boxes = target["boxes"]
+            boxes = box_xyxy_to_cxcywh(boxes)
+            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
+            target["boxes"] = boxes
+        if "area" in target:
+            area = target["area"]
+            area = area / (torch.tensor(w, dtype=torch.float32)*torch.tensor(h, dtype=torch.float32))
+            target["area"] = area
+        if "keypoints" in target:
+            keypoints = target["keypoints"]
+            # Clone before editing: the slice is a view, and target.copy() is
+            # shallow, so an in-place write would alter the caller's tensor.
+            V = keypoints[:, :, 2].clone()
+            V[V == 2] = 1
+            num_kpts = V.shape[-1]
+            if num_kpts > self.MAX_KEYPOINTS:
+                raise ValueError(
+                    "Normalize: got {} keypoints per instance, at most {} are supported".format(
+                        num_kpts, self.MAX_KEYPOINTS))
+            Z = keypoints[:, :, :2].contiguous().view(-1, 2 * num_kpts)
+            Z = Z / torch.tensor([w, h] * num_kpts, dtype=torch.float32)
+            target["valid_kpt_num"] = num_kpts
+            Z_pad = torch.zeros(Z.shape[0], self.MAX_KEYPOINTS * 2 - Z.shape[1])
+            V_pad = torch.zeros(V.shape[0], self.MAX_KEYPOINTS - V.shape[1])
+            # Row layout: [x0, y0, x1, y1, ... | zero pad | flags | zero pad].
+            target["keypoints"] = torch.cat([Z, Z_pad, V, V_pad], dim=1)
+        return image, target
+class Compose:
+    """Chain several ``(image, target) -> (image, target)`` transforms in order."""
+    def __init__(self, transforms):
+        self.transforms = transforms
+    def __call__(self, image, target):
+        sample = (image, target)
+        for transform in self.transforms:
+            sample = transform(*sample)
+        image, target = sample
+        return image, target
+    def __repr__(self):
+        # One indented line per transform, wrapped in "ClassName(" ... ")".
+        body = "".join("\n" + "    {0}".format(t) for t in self.transforms)
+        return self.__class__.__name__ + "(" + body + "\n)"

python/utils/dependencies/XPose/util/addict.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import copy
+class Dict(dict):
+    """``dict`` subclass whose items can also be read and written as attributes.
+
+    Reading a missing key returns a new, empty ``Dict`` that attaches itself to
+    its parent on first assignment, so ``d.a.b = 1`` creates the intermediate
+    level. Nested dicts (also inside lists/tuples) are converted on insertion
+    through the constructor and attribute/keyword paths. ``freeze()`` turns
+    missing-key reads and new-key writes into ``KeyError``.
+    """
+    # The receiver is named ``__self`` rather than ``self``
+    # (presumably so that ``self`` stays usable as a keyword key - TODO confirm).
+    def __init__(__self, *args, **kwargs):
+        # Bookkeeping lives on the instance via object.__setattr__, bypassing the
+        # item-backed __setattr__ below so it never shows up as a dict key.
+        object.__setattr__(__self, '__parent', kwargs.pop('__parent', None))
+        object.__setattr__(__self, '__key', kwargs.pop('__key', None))
+        object.__setattr__(__self, '__frozen', False)
+        for arg in args:
+            if not arg:
+                # Falsy positional arguments (None, {}, [], ...) are ignored.
+                continue
+            elif isinstance(arg, dict):
+                for key, val in arg.items():
+                    __self[key] = __self._hook(val)
+            elif isinstance(arg, tuple) and (not isinstance(arg[0], tuple)):
+                # A flat tuple is treated as a single (key, value) pair.
+                __self[arg[0]] = __self._hook(arg[1])
+            else:
+                # Anything else is iterated as (key, value) pairs.
+                for key, val in iter(arg):
+                    __self[key] = __self._hook(val)
+        for key, val in kwargs.items():
+            __self[key] = __self._hook(val)
+    def __setattr__(self, name, value):
+        """Store ``value`` as item ``name``; names defined on the class are read-only."""
+        if hasattr(self.__class__, name):
+            raise AttributeError("'Dict' object attribute "
+                                 "'{0}' is read-only".format(name))
+        else:
+            self[name] = value
+    def __setitem__(self, name, value):
+        """Set an item; raises ``KeyError`` for a new key while frozen."""
+        isFrozen = (hasattr(self, '__frozen') and
+                    object.__getattribute__(self, '__frozen'))
+        if isFrozen and name not in super(Dict, self).keys():
+                raise KeyError(name)
+        super(Dict, self).__setitem__(name, value)
+        try:
+            p = object.__getattribute__(self, '__parent')
+            key = object.__getattribute__(self, '__key')
+        except AttributeError:
+            p = None
+            key = None
+        if p is not None:
+            # First write into a placeholder created by __missing__: attach it to
+            # its parent, then drop the markers so this happens only once.
+            p[key] = self
+            object.__delattr__(self, '__parent')
+            object.__delattr__(self, '__key')
+    def __add__(self, other):
+        """Return ``other`` if this Dict is empty; otherwise raise ``TypeError``."""
+        if not self.keys():
+            return other
+        else:
+            self_type = type(self).__name__
+            other_type = type(other).__name__
+            msg = "unsupported operand type(s) for +: '{}' and '{}'"
+            raise TypeError(msg.format(self_type, other_type))
+    @classmethod
+    def _hook(cls, item):
+        """Convert dicts to ``cls``, recursing into lists and tuples; return other values as is."""
+        if isinstance(item, dict):
+            return cls(item)
+        elif isinstance(item, (list, tuple)):
+            return type(item)(cls._hook(elem) for elem in item)
+        return item
+    def __getattr__(self, item):
+        # Attribute reads fall back to item lookup (and thus to __missing__).
+        return self.__getitem__(item)
+    def __missing__(self, name):
+        """Return a detached child Dict for a missing key, or raise ``KeyError`` if frozen."""
+        if object.__getattribute__(self, '__frozen'):
+            raise KeyError(name)
+        # The child is not stored here; __setitem__ links it on its first write.
+        return self.__class__(__parent=self, __key=name)
+    def __delattr__(self, name):
+        # Attribute deletion removes the corresponding item.
+        del self[name]
+    def to_dict(self):
+        """Return a plain ``dict`` copy, converting nested instances (also in lists/tuples)."""
+        base = {}
+        for key, value in self.items():
+            if isinstance(value, type(self)):
+                base[key] = value.to_dict()
+            elif isinstance(value, (list, tuple)):
+                base[key] = type(value)(
+                    item.to_dict() if isinstance(item, type(self)) else
+                    item for item in value)
+            else:
+                base[key] = value
+        return base
+    def copy(self):
+        """Return ``copy.copy(self)``."""
+        return copy.copy(self)
+    def deepcopy(self):
+        """Return ``copy.deepcopy(self)``."""
+        return copy.deepcopy(self)
+    def __deepcopy__(self, memo):
+        other = self.__class__()
+        # Register the copy before recursing so self-references resolve to it.
+        memo[id(self)] = other
+        for key, value in self.items():
+            other[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo)
+        return other
+    def update(self, *args, **kwargs):
+        """Merge recursively: dict values are merged into existing dict values, others overwrite.
+
+        Raises ``TypeError`` when more than one positional argument is given.
+        """
+        other = {}
+        if args:
+            if len(args) > 1:
+                raise TypeError()
+            other.update(args[0])
+        other.update(kwargs)
+        for k, v in other.items():
+            if ((k not in self) or
+                (not isinstance(self[k], dict)) or
+                (not isinstance(v, dict))):
+                self[k] = v
+            else:
+                self[k].update(v)
+    # Copy/pickle protocol hooks: state is the mapping itself, restored via update().
+    def __getnewargs__(self):
+        return tuple(self.items())
+    def __getstate__(self):
+        return self
+    def __setstate__(self, state):
+        self.update(state)
+    # ``|``, reflected ``|`` and ``|=``, implemented with the recursive update() above.
+    def __or__(self, other):
+        if not isinstance(other, (Dict, dict)):
+            return NotImplemented
+        new = Dict(self)
+        new.update(other)
+        return new
+    def __ror__(self, other):
+        if not isinstance(other, (Dict, dict)):
+            return NotImplemented
+        new = Dict(other)
+        new.update(self)
+        return new
+    def __ior__(self, other):
+        self.update(other)
+        return self
+    def setdefault(self, key, default=None):
+        """Return ``self[key]`` if present; otherwise store and return ``default``."""
+        if key in self:
+            return self[key]
+        else:
+            self[key] = default
+            return default
+    def freeze(self, shouldFreeze=True):
+        """Set the frozen flag on this Dict and on every direct ``Dict`` value, recursively."""
+        object.__setattr__(self, '__frozen', shouldFreeze)
+        for key, val in self.items():
+            # Only direct Dict values are visited; Dicts inside lists/tuples are not.
+            if isinstance(val, Dict):
+                val.freeze(shouldFreeze)
+    def unfreeze(self):
+        """Shorthand for ``freeze(False)``."""
+        self.freeze(False)