qqc1989 committed on
Commit
ed861ec
·
verified ·
1 Parent(s): c694ad9

Upload 114 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. python/axmodels/feature_extractor.axmodel +3 -0
  3. python/axmodels/motion_extractor.axmodel +3 -0
  4. python/axmodels/spade_generator.axmodel +3 -0
  5. python/axmodels/stitching_retargeting.axmodel +3 -0
  6. python/axmodels/warp.onnx +3 -0
  7. python/cropper.py +242 -0
  8. python/infer.py +894 -0
  9. python/infer_onnx.py +952 -0
  10. python/requirements.txt +9 -0
  11. python/utils/__init__.py +0 -0
  12. python/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  13. python/utils/__pycache__/crop.cpython-310.pyc +0 -0
  14. python/utils/__pycache__/human_landmark_runner.cpython-310.pyc +0 -0
  15. python/utils/__pycache__/rprint.cpython-310.pyc +0 -0
  16. python/utils/__pycache__/timer.cpython-310.pyc +0 -0
  17. python/utils/crop.py +423 -0
  18. python/utils/dependencies/XPose/config_model/UniPose_SwinT.py +125 -0
  19. python/utils/dependencies/XPose/config_model/coco_transformer.py +8 -0
  20. python/utils/dependencies/XPose/models/UniPose/__init__.py +10 -0
  21. python/utils/dependencies/XPose/models/UniPose/attention.py +373 -0
  22. python/utils/dependencies/XPose/models/UniPose/backbone.py +211 -0
  23. python/utils/dependencies/XPose/models/UniPose/deformable_transformer.py +1230 -0
  24. python/utils/dependencies/XPose/models/UniPose/fuse_modules.py +274 -0
  25. python/utils/dependencies/XPose/models/UniPose/mask_generate.py +56 -0
  26. python/utils/dependencies/XPose/models/UniPose/ops/functions/__init__.py +10 -0
  27. python/utils/dependencies/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py +61 -0
  28. python/utils/dependencies/XPose/models/UniPose/ops/modules/__init__.py +9 -0
  29. python/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn.py +142 -0
  30. python/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py +130 -0
  31. python/utils/dependencies/XPose/models/UniPose/ops/setup.py +73 -0
  32. python/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp +41 -0
  33. python/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h +33 -0
  34. python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu +153 -0
  35. python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h +30 -0
  36. python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh +1327 -0
  37. python/utils/dependencies/XPose/models/UniPose/ops/src/ms_deform_attn.h +62 -0
  38. python/utils/dependencies/XPose/models/UniPose/ops/src/vision.cpp +16 -0
  39. python/utils/dependencies/XPose/models/UniPose/ops/test.py +89 -0
  40. python/utils/dependencies/XPose/models/UniPose/position_encoding.py +157 -0
  41. python/utils/dependencies/XPose/models/UniPose/swin_transformer.py +701 -0
  42. python/utils/dependencies/XPose/models/UniPose/transformer_deformable.py +595 -0
  43. python/utils/dependencies/XPose/models/UniPose/transformer_vanilla.py +102 -0
  44. python/utils/dependencies/XPose/models/UniPose/unipose.py +621 -0
  45. python/utils/dependencies/XPose/models/UniPose/utils.py +348 -0
  46. python/utils/dependencies/XPose/models/__init__.py +16 -0
  47. python/utils/dependencies/XPose/models/registry.py +58 -0
  48. python/utils/dependencies/XPose/predefined_keypoints.py +56 -0
  49. python/utils/dependencies/XPose/transforms.py +394 -0
  50. python/utils/dependencies/XPose/util/addict.py +159 -0
.gitattributes CHANGED
@@ -52,3 +52,4 @@ assets/examples/source/s5.jpg filter=lfs diff=lfs merge=lfs -text
52
  assets/examples/source/s6.jpg filter=lfs diff=lfs merge=lfs -text
53
  assets/examples/source/s7.jpg filter=lfs diff=lfs merge=lfs -text
54
  assets/examples/source/s9.jpg filter=lfs diff=lfs merge=lfs -text
 
 
52
  assets/examples/source/s6.jpg filter=lfs diff=lfs merge=lfs -text
53
  assets/examples/source/s7.jpg filter=lfs diff=lfs merge=lfs -text
54
  assets/examples/source/s9.jpg filter=lfs diff=lfs merge=lfs -text
55
+ python/utils/dependencies/insightface/data/images/t1.jpg filter=lfs diff=lfs merge=lfs -text
python/axmodels/feature_extractor.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79f688c174b4ff91ccd1b0a0869e2cad4ff962f914edb74a5c0a26a2d540cee9
3
+ size 1543019
python/axmodels/motion_extractor.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10dfe954ba0c9d1ab31997ed8c142a7336fba58c8344635b0d91d0c6a0eae341
3
+ size 38150196
python/axmodels/spade_generator.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5a097161c26b85dee1d4bafbcbdd1097a9b843414fae9c62a505221a37cc793
3
+ size 63354167
python/axmodels/stitching_retargeting.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12bf1d10463da83ad90cbfabfc58002127bf39432436a105213a294147191cd7
3
+ size 60571
python/axmodels/warp.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:160c613a5c8fc49d0a8aca1eacfa208642b673fa5b25e08853ac40548106b53c
3
+ size 201167246
python/cropper.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.dependencies.insightface.app import FaceAnalysis
2
+ from utils.dependencies.insightface.app.common import Face
3
+ from utils.timer import Timer
4
+ from utils.human_landmark_runner import LandmarkRunner as HumanLandmark
5
+ from utils.crop import crop_image
6
+ from typing import List, Tuple, Union
7
+ from dataclasses import dataclass, field
8
+ import numpy as np
9
+ import os.path as osp
10
+ import cv2
11
+
12
+
13
def contiguous(obj):
    """Return `obj` in C-contiguous memory layout.

    The input is returned untouched when it is already C-contiguous;
    otherwise a C-ordered copy is made and returned.
    """
    return obj if obj.flags.c_contiguous else obj.copy(order="C")
17
+
18
@dataclass
class Trajectory:
    """Container for the per-frame results accumulated while tracking one face.

    Every list field gets its own fresh empty list per instance.
    """
    # first / last frame index covered by the trajectory; -1 means "not set yet"
    start: int = -1
    end: int = -1
    # landmarks, bounding boxes and M_c2o matrices, one entry per frame
    lmk_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)
    bbox_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)
    M_c2o_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)

    # RGB frames, plus the landmarks and frames of the cropped version
    frame_rgb_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)
    lmk_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)
    frame_rgb_crop_lst: Union[Tuple, List, np.ndarray] = field(default_factory=list)
29
+
30
+
31
def make_abs_path(fn):
    """Join `fn` onto the directory that contains this source file (symlinks resolved)."""
    module_dir = osp.dirname(osp.realpath(__file__))
    return osp.join(module_dir, fn)
33
+
34
+
35
def sort_by_direction(faces, direction: str = 'large-small', face_center=None):
    """Order detected faces according to `direction`.

    Each face must expose `face['bbox']` as (x0, y0, x1, y1).

    Supported rules: 'left-right', 'right-left', 'top-bottom', 'bottom-top',
    'small-large', 'large-small' (by bbox area) and
    'distance-from-retarget-face' (by distance of the bbox center to
    `face_center`, nearest first). An empty input or an unknown rule returns
    `faces` unchanged.
    """
    if len(faces) <= 0:
        return faces

    def _bbox_area(face):
        box = face['bbox']
        return (box[2] - box[0]) * (box[3] - box[1])

    def _center_distance(face):
        box = face['bbox']
        dx = (box[2] + box[0]) / 2 - face_center[0]
        dy = (box[3] + box[1]) / 2 - face_center[1]
        return (dx ** 2 + dy ** 2) ** 0.5

    # rule name -> (sort key, descending?)
    rules = {
        'left-right': (lambda face: face['bbox'][0], False),
        'right-left': (lambda face: face['bbox'][0], True),
        'top-bottom': (lambda face: face['bbox'][1], False),
        'bottom-top': (lambda face: face['bbox'][1], True),
        'small-large': (_bbox_area, False),
        'large-small': (_bbox_area, True),
        'distance-from-retarget-face': (_center_distance, False),
    }
    if direction not in rules:
        return faces

    key_fn, descending = rules[direction]
    return sorted(faces, key=key_fn, reverse=descending)
53
+
54
+
55
class FaceAnalysisDIY(FaceAnalysis):
    """FaceAnalysis subclass whose `get` can skip the 106-point landmark model
    and returns the detected faces in a caller-selected order."""

    def __init__(self, name='buffalo_l', root='~/.insightface', allowed_modules=None, **kwargs):
        super().__init__(name=name, root=root, allowed_modules=allowed_modules, **kwargs)

        self.timer = Timer()

    def get(self, img_bgr, **kwargs):
        """Detect faces in `img_bgr`, run the remaining models on each, and sort them.

        Recognised keyword arguments:
            max_face_num: maximum number of detected faces, 0 means no limit.
            flag_do_landmark_2d_106: whether to run the 'landmark_2d_106' model.
            direction: sorting rule forwarded to `sort_by_direction`.
            face_center: (x, y) reference point, required by the
                'distance-from-retarget-face' rule.

        Returns a list of `Face` objects (empty when nothing is detected).
        """
        max_num = kwargs.get('max_face_num', 0)  # the number of the detected faces, 0 means no limit
        flag_do_landmark_2d_106 = kwargs.get('flag_do_landmark_2d_106', True)  # whether to do 106-point detection
        direction = kwargs.get('direction', 'large-small')  # sorting direction
        # Previously hard-coded to None, which made 'distance-from-retarget-face'
        # unusable (the sorter indexes into face_center).
        face_center = kwargs.get('face_center', None)

        bboxes, kpss = self.det_model.detect(img_bgr, max_num=max_num, metric='default')
        if bboxes.shape[0] == 0:
            return []

        ret = []
        for i in range(bboxes.shape[0]):
            bbox = bboxes[i, 0:4]
            det_score = bboxes[i, 4]
            kps = kpss[i] if kpss is not None else None
            face = Face(bbox=bbox, kps=kps, det_score=det_score)
            for taskname, model in self.models.items():
                if taskname == 'detection':
                    continue
                if (not flag_do_landmark_2d_106) and taskname == 'landmark_2d_106':
                    continue
                model.get(img_bgr, face)
            ret.append(face)

        return sort_by_direction(ret, direction, face_center)

    def warmup(self):
        """Run one detection on a blank 512x512 image and print the elapsed time."""
        self.timer.tic()

        img_bgr = np.zeros((512, 512, 3), dtype=np.uint8)
        self.get(img_bgr)

        elapse = self.timer.toc()
        print(f'FaceAnalysisDIY warmup time: {elapse:.3f}s')
100
+
101
+
102
class Cropper(object):
    """Face detection, landmark refinement and cropping for source images and driving frames.

    Only `image_type == "human_face"` is supported: this module creates no
    animal landmark runner, so other image types are rejected explicitly.
    """

    def __init__(self):
        self.face_analysis_wrapper_provider = ["CPUExecutionProvider"]
        self.insightface_root: str = make_abs_path("./pretrained_weights/insightface")
        self.device_id = 0
        self.landmark_ckpt_path: str = make_abs_path("./pretrained_weights/liveportrait/landmark.onnx")
        self.det_thresh: float = 0.1  # detection threshold
        self.device = "cpu"
        self.image_type = "human_face"
        self.direction: str = "large-small"  # direction of cropping
        self.max_face_num: int = 0  # max face number, 0 mean no limit
        self.dsize: int = 512  # crop size
        self.scale: float = 2.3  # scale factor
        self.vx_ratio: float = 0  # vx ratio
        self.vy_ratio: float = -0.125  # vy ratio +up, -down
        self.flag_do_rot: bool = True  # whether to conduct the rotation when flag_do_crop is True
        # Optional directory for dumping the crops as JPEGs; None disables the dump.
        # (Replaces unconditional writes to a hard-coded developer path.)
        self.debug_dump_dir = None

        self.face_analysis_wrapper = FaceAnalysisDIY(
            name="buffalo_l",
            root=self.insightface_root,
            providers=self.face_analysis_wrapper_provider,
        )
        self.face_analysis_wrapper.prepare(ctx_id=self.device_id, det_size=(512, 512), det_thresh=self.det_thresh)
        self.face_analysis_wrapper.warmup()

        self.human_landmark_runner = HumanLandmark(
            ckpt_path=self.landmark_ckpt_path,
            onnx_provider=self.device,
            device_id=self.device_id,
        )
        self.human_landmark_runner.warmup()

    @staticmethod
    def _log(msg):
        """Emit a diagnostic message; this module imports no logger, so print is used."""
        print(msg)

    def crop_source_image(self, img_rgb_: np.ndarray):
        """Detect the face in an RGB image and crop around it.

        Returns the dict produced by `crop_image`, extended with the keys
        "img_crop_256x256", "lmk_crop" and "lmk_crop_256x256", or None when no
        face is detected.

        Raises:
            NotImplementedError: if `self.image_type` is not "human_face".
        """
        if self.image_type != "human_face":
            raise NotImplementedError(
                f"image_type {self.image_type!r} is not supported: only 'human_face' is available."
            )

        img_rgb = img_rgb_.copy()  # never modify the caller's array
        img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

        src_face = self.face_analysis_wrapper.get(
            img_bgr,
            flag_do_landmark_2d_106=True,
            direction=self.direction,
            max_face_num=self.max_face_num,
        )

        if len(src_face) == 0:
            self._log("No face detected in the source image.")
            return None
        if len(src_face) > 1:
            self._log(f"More than one face detected in the image, only pick one face by rule {self.direction}.")

        # NOTE: temporarily only pick the first face, to support multiple face in the future
        src_face = src_face[0]
        lmk = src_face.landmark_2d_106  # this is the 106 landmarks from insightface

        # crop the face
        ret_dct = crop_image(
            img_rgb,  # ndarray
            lmk,  # 106x2 or Nx2
            dsize=self.dsize,
            scale=self.scale,
            vx_ratio=self.vx_ratio,
            vy_ratio=self.vy_ratio,
            flag_do_rot=self.flag_do_rot,
        )

        # update a 256x256 version for network input
        ret_dct["img_crop_256x256"] = cv2.resize(ret_dct["img_crop"], (256, 256), interpolation=cv2.INTER_AREA)

        dump_dir = getattr(self, "debug_dump_dir", None)
        if dump_dir is not None:
            # the crops come from an RGB image; cv2.imwrite expects BGR channel order
            for key in ("img_crop", "img_crop_256x256"):
                cv2.imwrite(osp.join(dump_dir, f"{key}.jpg"), cv2.cvtColor(ret_dct[key], cv2.COLOR_RGB2BGR))

        lmk = self.human_landmark_runner.run(img_rgb, lmk)
        ret_dct["lmk_crop"] = lmk
        ret_dct["lmk_crop_256x256"] = ret_dct["lmk_crop"] * 256 / self.dsize

        return ret_dct

    def calc_lmk_from_cropped_image(self, img_rgb_, **kwargs):
        """Return refined landmarks for an already-cropped RGB image, or None if no face is found.

        Keyword arguments:
            direction: face ordering rule used to pick one face (default "large-small").
        """
        direction = kwargs.get("direction", "large-small")
        src_face = self.face_analysis_wrapper.get(
            contiguous(img_rgb_[..., ::-1]),  # convert to BGR
            flag_do_landmark_2d_106=True,
            direction=direction,
        )
        if len(src_face) == 0:
            self._log("No face detected in the source image.")
            return None
        if len(src_face) > 1:
            self._log(f"More than one face detected in the image, only pick one face by rule {direction}.")
        src_face = src_face[0]
        lmk = src_face.landmark_2d_106
        lmk = self.human_landmark_runner.run(img_rgb_, lmk)

        return lmk

    def calc_lmks_from_cropped_video(self, driving_rgb_crop_lst, **kwargs):
        """Tracking based landmarks/alignment.

        The first frame is processed with face detection; every later frame is
        refined from the previous frame's landmarks. Returns the list of
        per-frame landmarks.

        Raises:
            Exception: if no face is detected in a frame that needs detection.
        """
        trajectory = Trajectory()
        direction = kwargs.get("direction", "large-small")

        for idx, frame_rgb_crop in enumerate(driving_rgb_crop_lst):
            if idx == 0 or trajectory.start == -1:
                src_face = self.face_analysis_wrapper.get(
                    contiguous(frame_rgb_crop[..., ::-1]),  # convert to BGR
                    flag_do_landmark_2d_106=True,
                    direction=direction,
                )
                if len(src_face) == 0:
                    self._log(f"No face detected in the frame #{idx}")
                    raise Exception(f"No face detected in the frame #{idx}")
                if len(src_face) > 1:
                    self._log(f"More than one face detected in the driving frame_{idx}, only pick one face by rule {direction}.")
                src_face = src_face[0]
                lmk = src_face.landmark_2d_106
                lmk = self.human_landmark_runner.run(frame_rgb_crop, lmk)
                trajectory.start, trajectory.end = idx, idx
            else:
                # track: seed the landmark model with the previous frame's landmarks
                lmk = self.human_landmark_runner.run(frame_rgb_crop, trajectory.lmk_lst[-1])
                trajectory.end = idx

            trajectory.lmk_lst.append(lmk)
        return trajectory.lmk_lst
242
+
python/infer.py ADDED
@@ -0,0 +1,894 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import cv2
3
+ import numpy as np
4
+ import os
5
+ import onnxruntime as ort
6
+ from axengine import InferenceSession
7
+ import numpy as np
8
+ import cv2
9
+ import argparse
10
+ import os.path as osp
11
+ from loguru import logger
12
+ from numpy import ndarray
13
+ import pickle as pkl
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from cropper import Cropper
17
+ import imageio
18
+ import subprocess
19
+ from utils.timer import Timer
20
+ from typing import Union
21
+ from scipy.spatial import ConvexHull # pylint: disable=E0401,E0611
22
+
23
+
24
+ appearance_feature_extractor, motion_extractor, warping_module, spade_generator, stitching_retargeting_module = None, None, None, None, None
25
+
26
+
27
def parse_args() -> argparse.Namespace:
    """Parse the command line: required --source, --driving and --models paths
    plus an optional --output-dir (default "./output")."""
    parser = argparse.ArgumentParser(
        prog="LivePortrait",
        description="LivePortrait: A Real-time 3D Live Portrait Animation System"
    )

    required_paths = (
        ("--source", "Path to source image."),
        ("--driving", "Path to driving image."),
        ("--models", "Path to onnx models."),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    parser.add_argument("--output-dir", type=str, default="./output", help="Path to infer results.")

    return parser.parse_args()
58
+
59
+
60
def images2video(images, wfp, **kwargs):
    """Encode a sequence of frames into a video file at `wfp` using imageio.

    Recognised keyword arguments (with defaults): fps (30), format ('mp4'),
    codec ('libx264'), quality (None), pixelformat ('yuv420p'),
    image_mode ('rgb'; 'bgr' frames get their last axis reversed before
    writing), macro_block_size (2) and crf (18).

    The writer is always closed, even if appending a frame fails.
    """
    fps = kwargs.get('fps', 30)
    video_format = kwargs.get('format', 'mp4')  # default is mp4 format
    codec = kwargs.get('codec', 'libx264')  # default is libx264 encoding
    quality = kwargs.get('quality')  # video quality
    pixelformat = kwargs.get('pixelformat', 'yuv420p')  # video pixel format
    image_mode = kwargs.get('image_mode', 'rgb')
    macro_block_size = kwargs.get('macro_block_size', 2)
    ffmpeg_params = ['-crf', str(kwargs.get('crf', 18))]

    writer = imageio.get_writer(
        wfp, fps=fps, format=video_format,
        codec=codec, quality=quality, ffmpeg_params=ffmpeg_params, pixelformat=pixelformat, macro_block_size=macro_block_size
    )

    try:
        for frame in images:
            if image_mode.lower() == 'bgr':
                writer.append_data(frame[..., ::-1])
            else:
                writer.append_data(frame)
    finally:
        # release the encoder/file handle even when a frame is rejected
        writer.close()
83
+
84
+
85
def has_audio_stream(video_path: str) -> bool:
    """
    Check if the video file contains an audio stream.

    ffprobe is invoked with an argument list (no shell), so paths containing
    spaces, quotes or shell metacharacters are passed through verbatim.

    :param video_path: Path to the video file
    :return: True if the video contains an audio stream, False otherwise
             (also False for directories, on ffprobe errors, or when ffprobe
             cannot be run)
    """
    if osp.isdir(video_path):
        return False

    cmd = [
        'ffprobe',
        '-v', 'error',
        '-select_streams', 'a',
        '-show_entries', 'stream=codec_type',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        video_path,
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            logger.info(f"Error occurred while probing video: {result.stderr}")
            return False

        # Check if there is any output from ffprobe command
        return bool(result.stdout.strip())
    except Exception:
        # best effort: a missing ffprobe binary must not abort the pipeline
        logger.info(
            f"Error occurred while probing video: {video_path}, "
            "you may need to install ffprobe! (https://ffmpeg.org/download.html) "
            "Now set audio to false!"
        )
        return False
121
+
122
+
123
def tensor_to_numpy(data: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
    """Convert a torch.Tensor to a numpy array on the CPU; any other input is returned as is."""
    if not isinstance(data, torch.Tensor):
        return data
    return data.data.cpu().numpy()
128
+
129
+
130
def calc_motion_multiplier(
    kp_source: Union[np.ndarray, torch.Tensor],
    kp_driving_initial: Union[np.ndarray, torch.Tensor]
) -> float:
    """Calculate motion_multiplier from the source keypoints and the first driving frame's keypoints.

    Both inputs carry a leading axis of size 1 that is squeezed away. The
    result is sqrt(hull.volume of source) / sqrt(hull.volume of driving),
    where each hull is scipy's ConvexHull of the keypoints.
    """
    hull_sizes = []
    for kp in (kp_source, kp_driving_initial):
        # torch tensors are moved to CPU numpy first; arrays are used directly
        points = kp.data.cpu().numpy() if isinstance(kp, torch.Tensor) else kp
        hull_sizes.append(ConvexHull(points.squeeze(0)).volume)

    source_area, driving_area = hull_sizes
    return np.sqrt(source_area) / np.sqrt(driving_area)
144
+
145
+
146
def load_video(video_info, n_frames=-1):
    """Read frames from `video_info` with imageio's ffmpeg reader.

    :param video_info: source handed to `imageio.get_reader`
    :param n_frames: maximum number of frames to read; values <= 0 read everything
    :return: list of frames in the order the reader yields them

    The reader is closed even if decoding a frame raises.
    """
    reader = imageio.get_reader(video_info, "ffmpeg")

    ret = []
    try:
        for idx, frame_rgb in enumerate(reader):
            if n_frames > 0 and idx >= n_frames:
                break
            ret.append(frame_rgb)
    finally:
        reader.close()
    return ret
157
+
158
+
159
def fast_check_ffmpeg():
    """Return True if `ffmpeg -version` runs successfully, False otherwise.

    Only launch failures (e.g. binary not found) and non-zero exits are
    treated as "not available"; KeyboardInterrupt / SystemExit propagate.
    """
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    except (OSError, subprocess.SubprocessError):
        return False
    return True
165
+
166
+
167
def is_video(file_path):
    """Return True when `file_path` has a known video suffix (case-insensitive)
    or points to an existing directory."""
    video_suffixes = (".mp4", ".mov", ".avi", ".webm")
    return file_path.lower().endswith(video_suffixes) or osp.isdir(file_path)
171
+
172
+
173
def is_image(file_path):
    """Return True when `file_path` ends with a known image suffix (case-insensitive)."""
    lowered = file_path.lower()
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp')
    return any(lowered.endswith(ext) for ext in image_extensions)
176
+
177
+
178
def get_fps(filepath, default_fps=25):
    """Return the frame rate reported by OpenCV for `filepath`.

    Falls back to `default_fps` when OpenCV reports 0/None or raises. The
    capture handle is released before returning.
    """
    try:
        capture = cv2.VideoCapture(filepath)
        try:
            fps = capture.get(cv2.CAP_PROP_FPS)
        finally:
            # the original leaked the capture handle
            capture.release()

        if fps in (0, None):
            fps = default_fps
    except Exception as e:
        logger.info(e)
        fps = default_fps

    return fps
189
+
190
+
191
def calculate_distance_ratio(lmk: np.ndarray, idx1: int, idx2: int, idx3: int, idx4: int, eps: float = 1e-6) -> np.ndarray:
    """Ratio of two landmark distances per batch entry.

    Computes |lmk[:, idx1] - lmk[:, idx2]| / (|lmk[:, idx3] - lmk[:, idx4]| + eps),
    with norms taken over axis 1 and that axis kept (keepdims=True).
    """
    numerator = np.linalg.norm(lmk[:, idx1] - lmk[:, idx2], axis=1, keepdims=True)
    denominator = np.linalg.norm(lmk[:, idx3] - lmk[:, idx4], axis=1, keepdims=True) + eps
    return numerator / denominator
194
+
195
+
196
def calc_eye_close_ratio(lmk: np.ndarray, target_eye_ratio: np.ndarray = None) -> np.ndarray:
    """Concatenate the left-eye and right-eye distance ratios along axis 1.

    The left eye uses landmark indices (6, 18, 0, 12), the right eye
    (30, 42, 24, 36). When `target_eye_ratio` is given it is appended as
    extra column(s).
    """
    columns = [
        calculate_distance_ratio(lmk, 6, 18, 0, 12),
        calculate_distance_ratio(lmk, 30, 42, 24, 36),
    ]
    if target_eye_ratio is not None:
        columns.append(target_eye_ratio)
    return np.concatenate(columns, axis=1)
203
+
204
+
205
def calc_lip_close_ratio(lmk: np.ndarray) -> np.ndarray:
    """Distance ratio for the lips: landmarks (90, 102) over landmarks (48, 66)."""
    return calculate_distance_ratio(lmk, idx1=90, idx2=102, idx3=48, idx4=66)
207
+
208
+
209
def concat_frames(driving_image_lst, source_image_lst, I_p_lst):
    """Build side-by-side frames: [driving |] source | result.

    Every source/driving frame is resized to the size of the first result
    frame. With a single source image it is reused for every result frame;
    with `driving_image_lst=None` the driving column is omitted.

    Returns an empty list when `I_p_lst` is empty (previously an IndexError).
    """
    # TODO: add more concat style, e.g., left-down corner driving
    if len(I_p_lst) == 0:
        return []

    out_lst = []
    h, w, _ = I_p_lst[0].shape
    source_image_resized_lst = [cv2.resize(img, (w, h)) for img in source_image_lst]

    for idx, I_p in enumerate(I_p_lst):
        source_image_resized = source_image_resized_lst[idx] if len(source_image_lst) > 1 else source_image_resized_lst[0]

        if driving_image_lst is None:
            out = np.hstack((source_image_resized, I_p))
        else:
            driving_image_resized = cv2.resize(driving_image_lst[idx], (w, h))
            out = np.hstack((driving_image_resized, source_image_resized, I_p))

        out_lst.append(out)
    return out_lst
228
+
229
+
230
def concat_feat(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
    """
    kp_source: (bs, k, 3)
    kp_driving: (bs, k, 3)
    Return: (bs, 2k*3)

    Each input is flattened per batch entry and the two are concatenated along
    dim 1. `reshape` is used instead of `view` so non-contiguous inputs
    (e.g. transposed tensors) are accepted as well.
    """
    bs_src = kp_source.shape[0]
    bs_dri = kp_driving.shape[0]
    assert bs_src == bs_dri, 'batch size must be equal'

    feat = torch.cat([kp_source.reshape(bs_src, -1), kp_driving.reshape(bs_dri, -1)], dim=1)
    return feat
242
+
243
+
244
+ DTYPE = np.float32
245
+ CV2_INTERP = cv2.INTER_LINEAR
246
+
247
+
248
def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
    """ conduct similarity or affine transformation to the image, do not do border operation!
    img: image passed to cv2.warpAffine
    M: 2x3 matrix or 3x3 matrix (only the first two rows are used)
    dsize: target shape (width, height), or a single value used for both
    borderMode: when given, forwarded together with a black borderValue
    """
    _dsize = tuple(dsize) if isinstance(dsize, (tuple, list)) else (dsize, dsize)

    warp_kwargs = {"dsize": _dsize, "flags": flags}
    if borderMode is not None:
        warp_kwargs.update(borderMode=borderMode, borderValue=(0, 0, 0))

    return cv2.warpAffine(img, M[:2, :], **warp_kwargs)
263
+
264
+
265
def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
    """Warp the crop mask with `crop_M_c2o` to `dsize` and scale it to float32 in units of 1/255,
    ready for later image paste back.
    """
    warped_mask = _transform_img(mask_crop, crop_M_c2o, dsize)
    return warped_mask.astype(np.float32) / 255.
271
+
272
+
273
def paste_back(img_crop, M_c2o, img_ori, mask_ori):
    """Warp `img_crop` to the size of `img_ori` and alpha-blend it in using `mask_ori`.

    The blend is mask * warped + (1 - mask) * original, clipped to 0..255 and
    returned as uint8.
    """
    height, width = img_ori.shape[0], img_ori.shape[1]
    warped = _transform_img(img_crop, M_c2o, dsize=(width, height))
    blended = mask_ori * warped + (1 - mask_ori) * img_ori
    return np.clip(blended, 0, 255).astype(np.uint8)
280
+
281
+
282
def prefix(filename):
    """a.jpg -> a

    Everything before the last "." is returned; a name without any "." is
    returned unchanged.
    """
    stem, dot, _ = filename.rpartition(".")
    return stem if dot else filename
288
+
289
+
290
def basename(filename):
    """a/b/c.jpg -> c

    Takes the last path component and drops everything from its last "." on.
    """
    leaf = osp.basename(filename)
    stem, dot, _ = leaf.rpartition(".")
    return stem if dot else leaf
293
+
294
+
295
def mkdir(d, log=False):
    """Create directory `d` (with parents) if it does not exist and return `d`.

    Returning the path allows one-line usage. With `log=True` a message is
    logged only when the directory was actually created.
    """
    if osp.exists(d):
        return d

    os.makedirs(d, exist_ok=True)
    if log:
        logger.info(f"Make dir: {d}")
    return d
302
+
303
+
304
def dct2device(dct: dict, device):
    """Move every value of `dct` to `device`, in place, and return the same dict.

    Values that are not already tensors are converted with `torch.tensor` first.
    """
    for key, value in dct.items():
        tensor = value if isinstance(value, torch.Tensor) else torch.tensor(value)
        dct[key] = tensor.to(device)
    return dct
311
+
312
+
313
+ PI = np.pi
314
+
315
def headpose_pred_to_degree(pred):
    """
    pred: (bs, 66) or (bs, 1) or others

    A (bs, 66) input is treated as logits over 66 bins: the softmax-weighted
    bin index is mapped to index * 3 - 97.5. Any other shape is returned
    unchanged.
    """
    if pred.ndim <= 1 or pred.shape[1] != 66:
        return pred

    # NOTE: note that the average is modified to 97.5
    bin_index = torch.FloatTensor(list(range(66))).to(pred.device)
    prob = F.softmax(pred, dim=1)
    return torch.sum(prob * bin_index, axis=1) * 3 - 97.5
330
+
331
+
332
def get_rotation_matrix(pitch_, yaw_, roll_):
    """ the input is in degree

    Builds per-sample rotation matrices about x (pitch), y (yaw) and z (roll),
    composes them as rot_z @ rot_y @ rot_x and returns the transpose of that
    product, shape (bs, 3, 3).
    """
    def _as_column_radians(degrees):
        # degree -> radian, and make 1-D inputs (bs,) into (bs, 1)
        radians = degrees / 180 * PI
        return radians.unsqueeze(1) if radians.ndim == 1 else radians

    x = _as_column_radians(pitch_)
    y = _as_column_radians(yaw_)
    z = _as_column_radians(roll_)

    device = x.device
    bs = x.shape[0]
    ones = torch.ones([bs, 1]).to(device)
    zeros = torch.zeros([bs, 1]).to(device)

    cos_x, sin_x = torch.cos(x), torch.sin(x)
    cos_y, sin_y = torch.cos(y), torch.sin(y)
    cos_z, sin_z = torch.cos(z), torch.sin(z)

    # each matrix is laid out row by row and reshaped to (bs, 3, 3)
    rot_x = torch.cat([
        ones, zeros, zeros,
        zeros, cos_x, -sin_x,
        zeros, sin_x, cos_x
    ], dim=1).reshape([bs, 3, 3])

    rot_y = torch.cat([
        cos_y, zeros, sin_y,
        zeros, ones, zeros,
        -sin_y, zeros, cos_y
    ], dim=1).reshape([bs, 3, 3])

    rot_z = torch.cat([
        cos_z, -sin_z, zeros,
        sin_z, cos_z, zeros,
        zeros, zeros, ones
    ], dim=1).reshape([bs, 3, 3])

    combined = rot_z @ rot_y @ rot_x
    return combined.permute(0, 2, 1)  # transpose
375
+
376
+
377
def make_abs_path(fn):
    """Join `fn` onto the directory that contains this source file (symlinks resolved)."""
    module_dir = osp.dirname(osp.realpath(__file__))
    return osp.join(module_dir, fn)
379
+
380
+
381
def load_image_rgb(image_path: str):
    """Read an image file with OpenCV and return it in RGB channel order.

    Raises:
        FileNotFoundError: if `image_path` does not exist.
        ValueError: if the file exists but OpenCV cannot decode it
            (cv2.imread returns None in that case).
    """
    if not osp.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None:
        # fail with a clear message instead of an opaque cvtColor assertion
        raise ValueError(f"Failed to decode image: {image_path}")
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
386
+
387
+
388
def resize_to_limit(img: np.ndarray, max_dim=1920, division=2):
    """
    Adjust the image so that its longest side does not exceed max_dim and its
    height and width are multiples of division.
    :param img: the image to be processed.
    :param max_dim: the maximum dimension constraint; values <= 0 disable resizing.
    :param division: the number the height and width must be multiples of (values < 1 are treated as 1).
    :return: the adjusted image.
    """
    h, w = img.shape[:2]

    # shrink so the longer side equals max_dim, keeping the aspect ratio
    if max_dim > 0 and max(h, w) > max_dim:
        if h > w:
            new_w, new_h = int(w * (max_dim / h)), max_dim
        else:
            new_w, new_h = max_dim, int(h * (max_dim / w))
        img = cv2.resize(img, (new_w, new_h))

    # trim the bottom/right edge so both sides become multiples of division
    division = max(division, 1)
    cur_h, cur_w = img.shape[0], img.shape[1]
    new_h = cur_h - cur_h % division
    new_w = cur_w - cur_w % division

    if new_h == 0 or new_w == 0:
        # a side is smaller than division: leave the image untouched
        return img

    if (new_h, new_w) != (cur_h, cur_w):
        img = img[:new_h, :new_w]

    return img
421
+
422
+
423
def preprocess(input_data):
    """Load the image at path `input_data` as RGB, apply `resize_to_limit`
    with its defaults, and return it wrapped in a one-element list."""
    return [resize_to_limit(load_image_rgb(input_data))]
427
+
428
+
429
def postprocess(output_data):
    """Placeholder post-processing hook: currently returns `output_data` unchanged."""
    # Add conversion of the raw model output to a specific format here if needed.
    return output_data
433
+
434
+
435
def infer(model, input_data):
    """Run `model` on the image at path `input_data`.

    The model's first input and first output names are used; the image is
    prepared by `preprocess` and the raw result is passed through
    `postprocess`.
    """
    first_input = model.get_inputs()[0].name
    first_output = model.get_outputs()[0].name
    feed = {first_input: preprocess(input_data)}  # rgb, resize & limit
    return postprocess(model.run([first_output], feed))
441
+
442
+
443
def partial_fields(target_class, kwargs):
    """Instantiate `target_class` with only those `kwargs` whose key is an
    attribute of the class (checked with `hasattr`)."""
    accepted = {}
    for name, value in kwargs.items():
        if hasattr(target_class, name):
            accepted[name] = value
    return target_class(**accepted)
445
+
446
+
447
def calc_ratio(lmk_lst):
    """Compute the eye and lip close ratios for each landmark set.

    Each landmark array gets a leading batch axis (`lmk[None]`) before being
    handed to the ratio helpers. Returns (eye_ratio_list, lip_ratio_list),
    both in input order.
    """
    # single pass over the input: eyes retargeting first, then lip retargeting
    pairs = [
        (calc_eye_close_ratio(lmk[None]), calc_lip_close_ratio(lmk[None]))
        for lmk in lmk_lst
    ]
    input_eye_ratio_lst = [eye for eye, _ in pairs]
    input_lip_ratio_lst = [lip for _, lip in pairs]
    return input_eye_ratio_lst, input_lip_ratio_lst
456
+
457
+
458
def prepare_videos(imgs) -> torch.Tensor:
    """ construct the input as standard

    imgs: either a list of HxWx3 frames (stacked to TxHxWx3 and given a
    trailing axis -> TxHxWx3x1) or an ndarray that is used as is
    (assumed to be 5-D already - TODO confirm against callers).
    Returns a float32 CPU tensor scaled by 1/255, clipped to 0~1 and
    permuted with (0, 4, 3, 1, 2), i.e. TxHxWx3x1 -> Tx1x3xHxW.
    Raises ValueError for any other input type.
    """
    if isinstance(imgs, list):
        frames = np.array(imgs)[..., np.newaxis]  # TxHxWx3x1
    elif isinstance(imgs, np.ndarray):
        frames = imgs
    else:
        raise ValueError(f'imgs type error: {type(imgs)}')

    normalized = np.clip(frames.astype(np.float32) / 255., 0, 1)  # clip to 0~1
    return torch.from_numpy(normalized).permute(0, 4, 3, 1, 2).to("cpu")
476
+
477
+
478
def get_kp_info(x: torch.Tensor) -> dict:
    """Run the motion extractor and collect the implicit keypoint information.

    x: Bx3xHxW, normalized to 0~1
    return: a dict with keys 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp',
        filled from the first seven session outputs in that order. The three
        pose entries are passed through ``headpose_pred_to_degree`` and given a
        trailing axis (Bx1); 'kp' and 'exp' are reshaped to BxNx3.
    """
    # TODO: the run() arguments in axengine still differ slightly from onnxruntime
    outs = motion_extractor.run(None, input_feed={"input": x.numpy()})

    field_order = ('pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp')
    kp_info = {name: torch.from_numpy(outs[pos]) for pos, name in enumerate(field_order)}

    flag_refine_info: bool = True  # refinement is always switched on here
    if flag_refine_info:
        bs = kp_info['kp'].shape[0]
        for angle in ('pitch', 'yaw', 'roll'):
            kp_info[angle] = headpose_pred_to_degree(kp_info[angle])[:, None]  # Bx1
        for name in ('kp', 'exp'):
            kp_info[name] = kp_info[name].reshape(bs, -1, 3)  # BxNx3

    return kp_info
506
+
507
+
508
def transform_keypoint(kp_info: dict):
    """Apply pose, expression, scale and translation to the implicit keypoints.

    kp_info: dict holding 'kp' (BxNx3 or Bx(N*3)), 'pitch', 'yaw', 'roll',
        't', 'exp' and 'scale'.
    Returns the transformed keypoints, BxNx3.
    """
    kp = kp_info['kp']  # (bs, k, 3)
    t, exp, scale = kp_info['t'], kp_info['exp'], kp_info['scale']

    # pose values go through headpose_pred_to_degree before building the rotation
    pitch, yaw, roll = (headpose_pred_to_degree(kp_info[key]) for key in ('pitch', 'yaw', 'roll'))

    bs = kp.shape[0]
    num_kp = kp.shape[1] // 3 if kp.ndim == 2 else kp.shape[1]  # Bx(num_kpx3) vs Bxnum_kpx3

    rot_mat = get_rotation_matrix(pitch, yaw, roll)  # (bs, 3, 3), Euler angles -> rotation matrix

    # Eqn.2: s * (R * x_c,s + exp) + t
    kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
    kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
    kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty
    return kp_transformed
536
+
537
+
538
def make_motion_template(I_lst, c_eyes_lst, c_lip_lst, **kwargs):
    """Build the per-frame driving motion template.

    I_lst: frames indexable along axis 0; each ``I_lst[i]`` is fed to ``get_kp_info``.
    c_eyes_lst / c_lip_lst: per-frame ratio arrays, stored as float32.
    kwargs: ``output_fps`` (default 25) is copied into the template.
    Returns a dict with 'n_frames', 'output_fps', 'motion' (one dict per frame
    with 'scale', 'R', 'exp', 't', 'kp', 'x_s' as float32 arrays),
    'c_eyes_lst' and 'c_lip_lst'.
    """
    def _to_f32(tensor):
        return tensor.cpu().numpy().astype(np.float32)

    n_frames = I_lst.shape[0]
    motion, eyes, lips = [], [], []

    for i in range(n_frames):
        # collect s, R, δ and t for inference
        info = get_kp_info(I_lst[i])
        x_s = transform_keypoint(info)
        R_i = get_rotation_matrix(info['pitch'], info['yaw'], info['roll'])

        motion.append({
            'scale': _to_f32(info['scale']),
            'R': _to_f32(R_i),
            'exp': _to_f32(info['exp']),
            't': _to_f32(info['t']),
            'kp': _to_f32(info['kp']),
            'x_s': _to_f32(x_s),
        })
        eyes.append(c_eyes_lst[i].astype(np.float32))
        lips.append(c_lip_lst[i].astype(np.float32))

    return {
        'n_frames': n_frames,
        'output_fps': kwargs.get('output_fps', 25),
        'motion': motion,
        'c_eyes_lst': eyes,
        'c_lip_lst': lips,
    }
573
+
574
+
575
def prepare_source(img: np.ndarray) -> torch.Tensor:
    """Convert a source image (or batch of images) into a normalized tensor.

    img: HxWx3 or BxHxWx3, uint8 (the caller passes a 256x256 crop)
    Returns a float32 CPU tensor, Bx3xHxW (B == 1 for a single image), in 0~1.
    Raises ValueError when ``img`` is neither 3-D nor 4-D.
    """
    if img.ndim == 3:
        batch = img[np.newaxis]  # HxWx3 -> 1xHxWx3
    elif img.ndim == 4:
        batch = img  # already BxHxWx3
    else:
        raise ValueError(f'img ndim should be 3 or 4: {img.ndim}')

    # astype() allocates a fresh array, so the caller's image is never modified
    # and no explicit copy is needed beforehand.
    x = np.clip(batch.astype(np.float32) / 255., 0, 1)  # normalized and clipped to 0~1
    return torch.from_numpy(x).permute(0, 3, 1, 2).to("cpu")  # BxHxWx3 -> Bx3xHxW
593
+
594
+
595
def extract_feature_3d(x: torch.Tensor) -> torch.Tensor:
    """Get the appearance feature of the image by F.

    x: Bx3xHxW, normalized to 0~1
    Returns the first output of the global ``appearance_feature_extractor``
    session, wrapped as a tensor.
    """
    feed = {"input": x.numpy()}
    feature = appearance_feature_extractor.run(None, input_feed=feed)[0]
    return torch.from_numpy(feature)
603
+
604
+
605
def stitch(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
    """Run the stitching network on the concatenated source/driving keypoints.

    kp_source: BxNx3
    kp_driving: BxNx3
    Return: Bx(3*num_kp+2)
    """
    stitch_input = concat_feat(kp_source, kp_driving).numpy()
    outputs = stitching_retargeting_module.run(None, input_feed={"input": stitch_input})
    return torch.from_numpy(outputs[0])
615
+
616
+
617
def stitching(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
    """Conduct the stitching: correct the driving keypoints with the predicted deltas.

    kp_source: Bxnum_kpx3
    kp_driving: Bxnum_kpx3
    Returns a corrected copy of ``kp_driving``; the argument itself is not modified.
    """
    bs, num_kp = kp_source.shape[:2]
    split = 3 * num_kp

    corrected = kp_driving.clone()
    delta = stitch(kp_source, corrected)

    expression_delta = delta[..., :split].reshape(bs, num_kp, 3)  # 1x20x3
    shift_xy = delta[..., split:split + 2].reshape(bs, 1, 2)  # 1x1x2

    corrected += expression_delta
    corrected[..., :2] += shift_xy
    return corrected
635
+
636
+
637
def warp_decode(feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
    """Warp the feature volume with the implicit keypoints and decode the image.

    feature_3d: Bx32x16x64x64, feature volume
    kp_source: BxNx3
    kp_driving: BxNx3
    NOTE: despite the annotation, the value returned is a dict whose 'out'
    entry holds the generator output as a tensor.
    """
    stopwatch = Timer()
    stopwatch.tic()
    # the warping session is asked for all outputs; the one at index 2 feeds the generator
    warped = warping_module.run(
        [],
        {
            "feature_3d": feature_3d.numpy(),
            "kp_driving": kp_driving.numpy(),
            "kp_source": kp_source.numpy(),
        },
    )[2]
    stopwatch.toc()
    logger.debug(f'warp time: {stopwatch.diff:.3f}s')

    decoded = spade_generator.run(None, input_feed={"input": warped})[0]
    return {'out': torch.from_numpy(decoded)}
654
+
655
+
656
def parse_output(out: torch.Tensor) -> np.ndarray:
    """Convert a channel-first output tensor into uint8 image data.

    out: 1x3xHxW; values outside 0~1 are clipped
    return: 1xHxWx3, uint8
    """
    channel_last = out.data.cpu().numpy().transpose(0, 2, 3, 1)  # 1x3xHxW -> 1xHxWx3
    unit_range = np.clip(channel_last, 0, 1)  # clip to 0~1
    return np.clip(unit_range * 255, 0, 255).astype(np.uint8)  # 0~1 -> 0~255
665
+
666
+
667
def load_model(model_type, model_path=None):
    """Create the inference session for one of the pipeline models.

    model_type: 'appearance_feature_extractor', 'motion_extractor',
        'warping_module', 'spade_generator' or 'stitching_retargeting_module'.
    model_path: directory that holds the model files.
    Returns the session. The warping module is loaded from ``warp.onnx`` through
    onnxruntime with the CPU provider; every other model is an ``.axmodel``
    file opened with ``InferenceSession``.
    Raises ValueError for an unknown ``model_type`` (this used to surface as an
    UnboundLocalError from the never-assigned local).
    """
    if model_type == 'warping_module':
        # The warping module is the only model run as plain ONNX on the CPU.
        return ort.InferenceSession(f'{model_path}/warp.onnx', providers=["CPUExecutionProvider"])

    axmodel_files = {
        'appearance_feature_extractor': 'feature_extractor.axmodel',
        'motion_extractor': 'motion_extractor.axmodel',
        'spade_generator': 'spade_generator.axmodel',
        'stitching_retargeting_module': 'stitching_retargeting.axmodel',
    }
    filename = axmodel_files.get(model_type)
    if filename is None:
        raise ValueError(f"Unknown model type: {model_type!r}")
    return InferenceSession(f'{model_path}/{filename}')
680
+
681
+
682
def main():
    """Animate a source portrait with a driving image or video and save the results.

    Steps, in order: parse the CLI arguments, load the five models into the
    module-level globals, read the source and driving media, build the driving
    motion template, crop the source face, generate one frame per driving
    frame, and write both the pasted-back result and a side-by-side
    concatenation into ``args.output_dir`` (``.mp4`` for a driving video,
    ``.jpg`` for a driving image).

    Raises ImportError when FFmpeg is unavailable, and Exception when the
    driving file type is unsupported or no face is found in the source image.
    """
    args = parse_args()

    # The helper functions of this file read the sessions from these globals.
    global appearance_feature_extractor
    appearance_feature_extractor = load_model("appearance_feature_extractor", args.models)

    global motion_extractor
    motion_extractor = load_model("motion_extractor", args.models)

    global warping_module
    warping_module = load_model("warping_module", args.models)

    global spade_generator
    spade_generator = load_model("spade_generator", args.models)

    global stitching_retargeting_module
    stitching_retargeting_module = load_model("stitching_retargeting_module", args.models)

    source = args.source
    driving = args.driving

    # Make an "ffmpeg" directory next to the working directory usable, if present.
    ffmpeg_dir = os.path.join(os.getcwd(), "ffmpeg")
    if osp.exists(ffmpeg_dir):
        os.environ["PATH"] += (os.pathsep + ffmpeg_dir)

    if not fast_check_ffmpeg():
        raise ImportError(
            "FFmpeg is not installed. Please install FFmpeg (including ffmpeg and ffprobe) before running this script. https://ffmpeg.org/download.html"
        )

    source_rgb_lst = preprocess(source)  # rgb, resize & limit
    if is_video(args.driving):
        flag_is_driving_video = True
        # load from video file, AND make motion template
        output_fps = int(get_fps(args.driving))
        driving_rgb_lst = load_video(args.driving)
    elif is_image(args.driving):
        flag_is_driving_video = False
        output_fps = 25
        driving_rgb_lst = [load_image_rgb(driving)]  # rgb
    else:
        raise Exception(f"{args.driving} is not a supported type!")

    ######## make motion template ########
    cropper: Cropper = Cropper()
    logger.info("Start making driving motion template...")
    driving_n_frames = len(driving_rgb_lst)
    n_frames = driving_n_frames
    # Landmarks are computed on the full driving frames; the frames themselves
    # are only resized (not face-cropped) to 256x256.
    driving_lmk_crop_lst = cropper.calc_lmks_from_cropped_video(driving_rgb_lst)  # cropper.
    driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst]  # force to resize to 256x256
    #######################################

    c_d_eyes_lst, c_d_lip_lst = calc_ratio(driving_lmk_crop_lst)
    # save the motion template
    I_d_lst = prepare_videos(driving_rgb_crop_256x256_lst)
    driving_template_dct = make_motion_template(I_d_lst, c_d_eyes_lst, c_d_lip_lst, output_fps=output_fps)
    # wfp_template = remove_suffix(args.driving) + '.pkl'
    # dump(wfp_template, driving_template_dct)
    # logger.info(f"Dump motion template to {wfp_template}")

    if not flag_is_driving_video:
        # n_frames is 1 for a driving image, so these repeats leave the lists unchanged
        c_d_eyes_lst = c_d_eyes_lst * n_frames
        c_d_lip_lst = c_d_lip_lst * n_frames

    I_p_pstbk_lst = []
    logger.info("Prepared pasteback mask done.")

    I_p_lst = []
    R_d_0, x_d_0_info = None, None
    # The next flags/deltas are assigned but not read again in this function.
    flag_normalize_lip = False  # inf_cfg.flag_normalize_lip  # not overwrite
    flag_source_video_eye_retargeting = False  # inf_cfg.flag_source_video_eye_retargeting  # not overwrite
    lip_delta_before_animation, eye_delta_before_animation = None, None

    ######## process source info ########
    # if the input is a source image, process it only once
    flag_do_crop = True
    if flag_do_crop:
        crop_info = cropper.crop_source_image(source_rgb_lst[0])
        if crop_info is None:
            raise Exception("No face detected in the source image!")
        source_lmk = crop_info['lmk_crop']
        img_crop_256x256 = crop_info['img_crop_256x256']
    else:
        # NOTE(review): this branch leaves crop_info unassigned although it is
        # used below; it is unreachable while flag_do_crop is hard-coded True.
        source_lmk = cropper.calc_lmk_from_cropped_image(source_rgb_lst[0])
        img_crop_256x256 = cv2.resize(source_rgb_lst[0], (256, 256))  # force to resize to 256x256

    I_s = prepare_source(img_crop_256x256)
    x_s_info = get_kp_info(I_s)
    x_c_s = x_s_info['kp']
    R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
    f_s = extract_feature_3d(I_s)
    x_s = transform_keypoint(x_s_info)

    # let lip-open scalar to be 0 at first
    # Mask used to blend the generated crop back into the original image.
    mask_crop: ndarray = cv2.imread(make_abs_path('./utils/resources/mask_template.png'), cv2.IMREAD_COLOR)
    mask_ori_float = prepare_paste_back(mask_crop, crop_info['M_c2o'], dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0]))

    # lip_array is the reference subtracted from the driving expression when driving with a single image.
    with open(make_abs_path('./utils/resources/lip_array.pkl'), 'rb') as f:
        lip_array = pkl.load(f)
    device = "cpu"
    flag_is_source_video = False
    ######## animate ########
    if flag_is_driving_video:  # or (flag_is_source_video and not flag_is_driving_video)
        logger.info(f"The animated video consists of {n_frames} frames.")
    else:
        logger.info(f"The output of image-driven portrait animation is an image.")
    for i in range(n_frames):
        x_d_i_info = driving_template_dct['motion'][i]
        x_d_i_info = dct2device(x_d_i_info, device)
        R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d']  # compatible with previous keys

        if i == 0:  # cache the first frame
            R_d_0 = R_d_i
            x_d_0_info = x_d_i_info.copy()

        delta_new = x_s_info['exp'].clone()
        # NOTE(review): x_d_r_lst_smooth is not assigned anywhere in this
        # function; it is only evaluated when flag_is_source_video is True,
        # which is hard-coded False above.
        R_new = x_d_r_lst_smooth[i] if flag_is_source_video else (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
        if flag_is_driving_video:
            # relative expression: offset from the first driving frame
            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
        else:
            delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(lip_array).to(dtype=torch.float32, device=device))
        # delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(lip_array).to(dtype=torch.float32, device=device))
        scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
        t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
        t_new[..., 2].fill_(0)  # zero tz
        x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new

        # NOTE(review): the nesting of the two x_d_diff lines was reconstructed
        # from a flattened dump. They are kept inside this `if` because
        # x_d_0_new is only assigned here and would otherwise be undefined for
        # a driving image -- confirm against the original file.
        if i == 0 and flag_is_driving_video:
            x_d_0_new = x_d_i_new
            motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
            # motion_multiplier *= inf_cfg.driving_multiplier
            x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
            x_d_i_new = x_d_diff + x_s

        # Algorithm 1:
        # with stitching and without retargeting
        x_d_i_new = stitching(x_s, x_d_i_new)
        x_d_i_new = x_s + (x_d_i_new - x_s) * 1.0
        out = warp_decode(f_s, x_s, x_d_i_new)
        I_p_i = parse_output(out['out'])[0]
        I_p_lst.append(I_p_i)
        # blend the generated crop back into the full-size source image
        I_p_pstbk = paste_back(I_p_i, crop_info['M_c2o'], source_rgb_lst[0], mask_ori_float)
        I_p_pstbk_lst.append(I_p_pstbk)

    mkdir(args.output_dir)
    wfp_concat = None
    ######### build the final concatenation result #########
    # driving frame | source frame | generation
    frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst)

    if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video):
        flag_source_has_audio = flag_is_source_video and has_audio_stream(args.source)
        flag_driving_has_audio = has_audio_stream(args.driving)

        wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.mp4')

        # NOTE: update output fps
        # NOTE(review): source_fps is not assigned in this function; it is only
        # evaluated when flag_is_source_video is True (hard-coded False).
        output_fps = source_fps if flag_is_source_video else output_fps
        images2video(frames_concatenated, wfp=wfp_concat, fps=output_fps)

        if flag_source_has_audio or flag_driving_has_audio:
            # final result with concatenation
            wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
            # NOTE(review): reads args.audio_priority -- confirm parse_args defines that option.
            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
            logger.info(f"Audio is selected from {audio_from_which_video}, concat mode")
            add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
            os.replace(wfp_concat_with_audio, wfp_concat)
            logger.info(f"Replace {wfp_concat_with_audio} with {wfp_concat}")

        # save the animated result
        wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.mp4')
        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
            images2video(I_p_pstbk_lst, wfp=wfp, fps=output_fps)
        else:
            images2video(I_p_lst, wfp=wfp, fps=output_fps)

        ######### build the final result #########
        if flag_source_has_audio or flag_driving_has_audio:
            wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
            audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
            logger.info(f"Audio is selected from {audio_from_which_video}")
            add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
            os.replace(wfp_with_audio, wfp)
            logger.info(f"Replace {wfp_with_audio} with {wfp}")

        # final log
        # if wfp_template not in (None, ''):
        #     logger.info(f'Animated template: {wfp_template}, you can specify `-d` argument with this template path next time to avoid cropping video, motion making and protecting privacy.', style='bold green')
        logger.info(f'Animated video: {wfp}')
        logger.info(f'Animated video with concat: {wfp_concat}')
    else:
        # Image-driven case: write single JPEGs; [..., ::-1] flips RGB to BGR for cv2.imwrite.
        wfp_concat = osp.join(args.output_dir, f'{basename(source)}--{basename(driving)}_concat.jpg')
        cv2.imwrite(wfp_concat, frames_concatenated[0][..., ::-1])
        wfp = osp.join(args.output_dir, f'{basename(source)}--{basename(driving)}.jpg')
        if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
            cv2.imwrite(wfp, I_p_pstbk_lst[0][..., ::-1])
        else:
            cv2.imwrite(wfp, frames_concatenated[0][..., ::-1])
        # final log
        logger.info(f'Animated image: {wfp}')
        logger.info(f'Animated image with concat: {wfp_concat}')
883
+
884
+
885
if __name__ == "__main__":
    """
    Usage:
    python3 infer.py --source ../assets/examples/source/s0.jpg --driving ../assets/examples/driving/d8.jpg --models ./axmodels --output-dir ./axmodel_infer
    """
    # Time the whole run, model loading included.
    timer = Timer()
    timer.tic()
    main()
    elapse = timer.toc()
    logger.debug(f'LivePortrait axmodel infer time: {elapse:.3f}s')
python/infer_onnx.py ADDED
@@ -0,0 +1,952 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import cv2
3
+ import numpy as np
4
+ import os
5
+ import onnxruntime as ort
6
+ import numpy as np
7
+ import cv2
8
+ import argparse
9
+ import os.path as osp
10
+ from loguru import logger
11
+ from numpy import ndarray
12
+ import pickle as pkl
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from cropper import Cropper
16
+ import imageio
17
+ import subprocess
18
+ from utils.timer import Timer
19
+ from typing import Union
20
+ from scipy.spatial import ConvexHull # pylint: disable=E0401,E0611
21
+
22
+
23
# Module-level handles for the five model sessions; None until they are assigned
# (presumably in main() -- confirm).
appearance_feature_extractor, motion_extractor, warping_module, spade_generator, stitching_retargeting_module = None, None, None, None, None
24
+
25
+
26
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the inference script.

    ``--source``, ``--driving`` and ``--models`` are required strings;
    ``--output-dir`` is optional and defaults to "./output".
    """
    parser = argparse.ArgumentParser(
        prog="LivePortrait",
        description="LivePortrait: A Real-time 3D Live Portrait Animation System",
    )

    required_paths = (
        ("--source", "Path to source image."),
        ("--driving", "Path to driving image."),
        ("--models", "Path to onnx models."),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    parser.add_argument("--output-dir", type=str, default="./output", help="Path to infer results.")
    return parser.parse_args()
57
+
58
+
59
def images2video(images, wfp, **kwargs):
    """Encode a sequence of frames into a video file with imageio.

    images: iterable of frames.
    wfp: output file path.
    kwargs: ``fps`` (30), ``format`` ('mp4'), ``codec`` ('libx264'),
        ``quality`` (None), ``pixelformat`` ('yuv420p'), ``image_mode``
        ('rgb'; with 'bgr' the last axis of every frame is reversed),
        ``macro_block_size`` (2) and ``crf`` (18).
    """
    fps = kwargs.get('fps', 30)
    video_format = kwargs.get('format', 'mp4')  # default is mp4 format
    codec = kwargs.get('codec', 'libx264')  # default is libx264 encoding
    quality = kwargs.get('quality')  # video quality
    pixelformat = kwargs.get('pixelformat', 'yuv420p')  # video pixel format
    image_mode = kwargs.get('image_mode', 'rgb')
    macro_block_size = kwargs.get('macro_block_size', 2)
    ffmpeg_params = ['-crf', str(kwargs.get('crf', 18))]

    # The channel order is the same for every frame, so decide once.
    flip_channels = image_mode.lower() == 'bgr'

    writer = imageio.get_writer(
        wfp, fps=fps, format=video_format,
        codec=codec, quality=quality, ffmpeg_params=ffmpeg_params, pixelformat=pixelformat, macro_block_size=macro_block_size
    )
    try:
        for frame in images:
            writer.append_data(frame[..., ::-1] if flip_channels else frame)
    finally:
        # Close the writer even when a frame fails to encode.
        writer.close()
82
+
83
+
84
def is_template(file_path):
    """Return True when ``file_path`` ends with ".pkl", False otherwise."""
    return file_path.endswith(".pkl")
88
+
89
+
90
def has_audio_stream(video_path: str) -> bool:
    """
    Check if the video file contains an audio stream.

    :param video_path: Path to the video file
    :return: True if ffprobe lists at least one audio stream; False for a
        directory, when ffprobe exits with an error, or when it cannot be run
    """
    if osp.isdir(video_path):
        return False

    # Argument list instead of a shell string: the path is passed verbatim, so
    # spaces or quotes in it cannot break (or inject into) the command.
    cmd = [
        'ffprobe',
        '-v', 'error',
        '-select_streams', 'a',
        '-show_entries', 'stream=codec_type',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        video_path,
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # Best effort: a missing or unusable ffprobe just means "no audio".
        logger.info(
            f"Error occurred while probing video: {video_path}, "
            "you may need to install ffprobe! (https://ffmpeg.org/download.html) "
            f"Now set audio to false! ({e})"
        )
        return False

    if result.returncode != 0:
        logger.info(f"Error occurred while probing video: {result.stderr}")
        return False

    # Check if there is any output from ffprobe command
    return bool(result.stdout.strip())
126
+
127
+
128
def tensor_to_numpy(data: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
    """Return ``data`` as a numpy array: tensors are converted, anything else is returned as-is."""
    if not isinstance(data, torch.Tensor):
        return data
    return data.data.cpu().numpy()
133
+
134
+
135
def calc_motion_multiplier(
    kp_source: Union[np.ndarray, torch.Tensor],
    kp_driving_initial: Union[np.ndarray, torch.Tensor]
) -> float:
    """Calculate motion_multiplier from the source keypoints and the first driving frame.

    Both inputs are converted with ``tensor_to_numpy`` and have their leading
    axis squeezed away. The result is the ratio of the square roots of the two
    convex-hull ``volume`` values: sqrt(source) / sqrt(driving).
    """
    source_points = tensor_to_numpy(kp_source).squeeze(0)
    driving_points = tensor_to_numpy(kp_driving_initial).squeeze(0)

    source_measure = ConvexHull(source_points).volume
    driving_measure = ConvexHull(driving_points).volume
    # a cube-root variant also exists upstream (np.cbrt instead of np.sqrt)
    return np.sqrt(source_measure) / np.sqrt(driving_measure)
149
+
150
+
151
def load_video(video_info, n_frames=-1):
    """Read the frames of a video with imageio's "ffmpeg" reader.

    video_info: passed straight to ``imageio.get_reader``.
    n_frames: maximum number of frames to read; a value <= 0 means no limit.
    Returns the frames as a list, in reading order.
    """
    reader = imageio.get_reader(video_info, "ffmpeg")

    ret = []
    try:
        for idx, frame_rgb in enumerate(reader):
            if n_frames > 0 and idx >= n_frames:
                break
            ret.append(frame_rgb)
    finally:
        # Close the reader even when decoding fails part-way through.
        reader.close()
    return ret
162
+
163
+
164
def fast_check_ffmpeg():
    """Return True when ``ffmpeg -version`` can be run and exits with status 0."""
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    except (OSError, subprocess.SubprocessError):
        # Missing executable or non-zero exit status. Unlike the former bare
        # ``except:``, KeyboardInterrupt and SystemExit are not swallowed.
        return False
    return True
170
+
171
+
172
def is_video(file_path):
    """Return True for a path with a video extension (case-insensitive) or for a directory."""
    video_extensions = (".mp4", ".mov", ".avi", ".webm")
    return file_path.lower().endswith(video_extensions) or osp.isdir(file_path)
176
+
177
+
178
def is_image(file_path):
    """Return True when ``file_path`` ends with a known image extension (case-insensitive)."""
    lowered = file_path.lower()
    return lowered.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'))
181
+
182
+
183
def get_fps(filepath, default_fps=25):
    """Return the frame rate OpenCV reports for ``filepath``.

    Falls back to ``default_fps`` when the reported value is 0 or None, or
    when querying the file raises.
    """
    capture = None
    try:
        capture = cv2.VideoCapture(filepath)
        fps = capture.get(cv2.CAP_PROP_FPS)

        if fps in (0, None):
            fps = default_fps
    except Exception as e:
        # Deliberate best effort: any failure falls back to the default.
        logger.info(e)
        fps = default_fps
    finally:
        # Release the capture instead of leaving it open until garbage collection.
        if capture is not None:
            capture.release()

    return fps
194
+
195
+
196
def calculate_distance_ratio(lmk: np.ndarray, idx1: int, idx2: int, idx3: int, idx4: int, eps: float = 1e-6) -> np.ndarray:
    """Ratio of two landmark distances: |p1 - p2| / (|p3 - p4| + eps).

    Landmarks are picked with ``lmk[:, idx]``; each norm is taken along axis 1
    of the difference with ``keepdims=True``. ``eps`` guards the division.
    """
    numerator = np.linalg.norm(lmk[:, idx1] - lmk[:, idx2], axis=1, keepdims=True)
    denominator = np.linalg.norm(lmk[:, idx3] - lmk[:, idx4], axis=1, keepdims=True) + eps
    return numerator / denominator
199
+
200
+
201
def calc_eye_close_ratio(lmk: np.ndarray, target_eye_ratio: np.ndarray = None) -> np.ndarray:
    """Concatenate the left and right eye close ratios along axis 1.

    When ``target_eye_ratio`` is given it is appended as further column(s).
    """
    columns = [
        calculate_distance_ratio(lmk, 6, 18, 0, 12),    # left eye
        calculate_distance_ratio(lmk, 30, 42, 24, 36),  # right eye
    ]
    if target_eye_ratio is not None:
        columns.append(target_eye_ratio)
    return np.concatenate(columns, axis=1)
208
+
209
+
210
def calc_lip_close_ratio(lmk: np.ndarray) -> np.ndarray:
    """Lip close ratio: distance between landmarks 90 and 102 over the distance between 48 and 66."""
    numerator_pair = (90, 102)
    denominator_pair = (48, 66)
    return calculate_distance_ratio(lmk, *numerator_pair, *denominator_pair)
212
+
213
+
214
def concat_frames(driving_image_lst, source_image_lst, I_p_lst):
    """Build side-by-side frames: [driving |] source | generated.

    All tiles are resized to the height/width of ``I_p_lst[0]``. If
    ``source_image_lst`` holds a single image it is reused for every frame,
    otherwise the source at the same index is used. With
    ``driving_image_lst`` set to None the driving tile is left out.
    Returns one stacked image per entry of ``I_p_lst``.
    """
    # TODO: add more concat style, e.g., left-down corner driving
    h, w, _ = I_p_lst[0].shape
    resized_sources = [cv2.resize(img, (w, h)) for img in source_image_lst]
    per_frame_source = len(source_image_lst) > 1

    out_lst = []
    for idx, generated in enumerate(I_p_lst):
        tiles = [resized_sources[idx] if per_frame_source else resized_sources[0], generated]
        if driving_image_lst is not None:
            tiles.insert(0, cv2.resize(driving_image_lst[idx], (w, h)))
        out_lst.append(np.hstack(tiles))
    return out_lst
233
+
234
+
235
def concat_feat(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
    """Flatten both keypoint sets per sample and join them along dim 1.

    kp_source: (bs, k, 3)
    kp_driving: (bs, k, 3)
    Return: (bs, 2k*3)
    """
    bs_src, bs_dri = kp_source.shape[0], kp_driving.shape[0]
    assert bs_src == bs_dri, 'batch size must be equal'

    flat_source = kp_source.view(bs_src, -1)
    flat_driving = kp_driving.view(bs_dri, -1)
    return torch.cat((flat_source, flat_driving), dim=1)
247
+
248
+
249
# Float dtype constant (float32).
DTYPE = np.float32
# Default interpolation flag of ``_transform_img``.
CV2_INTERP = cv2.INTER_LINEAR
251
+
252
+
253
def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
    """Apply a similarity or affine transformation to the image with ``cv2.warpAffine``.

    img: image to warp
    M: 2x3 matrix or 3x3 matrix (only the first two rows are used)
    dsize: target shape (width, height); a scalar means a square output
    borderMode: when given, it is forwarded together with a black border value
    """
    _dsize = tuple(dsize) if isinstance(dsize, (tuple, list)) else (dsize, dsize)

    warp_kwargs = {"dsize": _dsize, "flags": flags}
    if borderMode is not None:
        warp_kwargs.update(borderMode=borderMode, borderValue=(0, 0, 0))
    return cv2.warpAffine(img, M[:2, :], **warp_kwargs)
268
+
269
+
270
def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
    """Prepare the mask for a later paste-back.

    The crop mask is warped with ``crop_M_c2o`` to ``dsize`` and returned as
    float32 divided by 255.
    """
    warped_mask = _transform_img(mask_crop, crop_M_c2o, dsize)
    return warped_mask.astype(np.float32) / 255.
276
+
277
+
278
def paste_back(img_crop, M_c2o, img_ori, mask_ori):
    """Paste the cropped image back into the original one.

    ``img_crop`` is warped with ``M_c2o`` to the size of ``img_ori`` and then
    blended as ``mask * warped + (1 - mask) * original``; the result is clipped
    to 0~255 and returned as uint8.
    """
    height, width = img_ori.shape[0], img_ori.shape[1]
    warped = _transform_img(img_crop, M_c2o, dsize=(width, height))
    blended = mask_ori * warped + (1 - mask_ori) * img_ori
    return np.clip(blended, 0, 255).astype(np.uint8)
285
+
286
+
287
def prefix(filename):
    """a.jpg -> a

    Everything before the last "." is returned; a name without any "." is
    returned unchanged.
    """
    head, dot, _ = filename.rpartition(".")
    return head if dot else filename
293
+
294
+
295
def basename(filename):
    """a/b/c.jpg -> c"""
    leaf = osp.basename(filename)  # drop the directory part
    return prefix(leaf)  # then drop the extension
298
+
299
+
300
def mkdir(d, log=False):
    """Create directory ``d`` (with parents) if it is missing and return ``d``.

    Returning the path allows one-line use. With ``log`` set, a message is
    logged when the directory had to be created.
    """
    already_there = osp.exists(d)
    if not already_there:
        os.makedirs(d, exist_ok=True)
        if log:
            logger.info(f"Make dir: {d}")
    return d
307
+
308
+
309
def dct2device(dct: dict, device):
    """Move every value of ``dct`` to ``device``, in place.

    Values that are not tensors are first converted with ``torch.tensor``.
    The same dict object is returned.
    """
    for key, value in dct.items():
        as_tensor = value if isinstance(value, torch.Tensor) else torch.tensor(value)
        dct[key] = as_tensor.to(device)
    return dct
316
+
317
+
318
# pi; used by get_rotation_matrix for the degree -> radian conversion
PI = np.pi
319
+
320
def headpose_pred_to_degree(pred):
    """Turn a 66-bin head-pose prediction into a value in degrees.

    pred: (bs, 66) or (bs, 1) or others
    A (bs, 66) input is soft-maxed along dim 1; the expected bin index is
    mapped with ``index * 3 - 97.5`` and returned with shape (bs,). Any other
    input is returned unchanged.
    """
    is_bin_scores = pred.ndim > 1 and pred.shape[1] == 66
    if not is_bin_scores:
        return pred

    # NOTE: note that the average is modified to 97.5
    bin_index = torch.FloatTensor(list(range(66))).to(pred.device)
    probabilities = F.softmax(pred, dim=1)
    return torch.sum(probabilities * bin_index, axis=1) * 3 - 97.5
335
+
336
+
337
def get_rotation_matrix(pitch_, yaw_, roll_):
    """Build rotation matrices from Euler angles; the input is in degree.

    Each angle may be (bs,) or (bs, 1). Returns a (bs, 3, 3) tensor holding
    the transpose of ``rot_z @ rot_y @ rot_x``.
    """
    def _as_radian_column(degrees):
        # transform to radian and make sure the shape is (bs, 1)
        radians = degrees / 180 * PI
        return radians.unsqueeze(1) if radians.ndim == 1 else radians

    x = _as_radian_column(pitch_)
    y = _as_radian_column(yaw_)
    z = _as_radian_column(roll_)

    device = x.device
    bs = x.shape[0]
    ones = torch.ones([bs, 1]).to(device)
    zeros = torch.zeros([bs, 1]).to(device)

    def _matrix(entries):
        # nine (bs, 1) columns in row-major order -> (bs, 3, 3)
        return torch.cat(entries, dim=1).reshape([bs, 3, 3])

    # calculate the euler matrix
    rot_x = _matrix([
        ones, zeros, zeros,
        zeros, torch.cos(x), -torch.sin(x),
        zeros, torch.sin(x), torch.cos(x),
    ])
    rot_y = _matrix([
        torch.cos(y), zeros, torch.sin(y),
        zeros, ones, zeros,
        -torch.sin(y), zeros, torch.cos(y),
    ])
    rot_z = _matrix([
        torch.cos(z), -torch.sin(z), zeros,
        torch.sin(z), torch.cos(z), zeros,
        zeros, zeros, ones,
    ])

    return (rot_z @ rot_y @ rot_x).permute(0, 2, 1)  # transpose
380
+
381
+
382
def suffix(filename):
    """Return the text after the last "." in ``filename`` (a.jpg -> jpg).

    An empty string is returned when ``filename`` contains no dot.
    """
    _, dot, extension = filename.rpartition(".")
    return extension if dot else ""
388
+
389
+
390
def remove_suffix(filepath):
    """Drop the extension but keep the directory part: a/b/c.jpg -> a/b/c.

    The file stem comes from the module's ``basename`` helper and is joined
    back onto the directory of ``filepath``.
    """
    parent_dir = osp.dirname(filepath)
    stem = basename(filepath)
    return osp.join(parent_dir, stem)
393
+
394
+
395
def load(fp):
    """Load an object from ``fp``, choosing the reader from the file extension.

    :param fp: path ending in ``.npy`` (read with ``np.load``) or ``.pkl``
        (read with pickle).
    :return: the loaded object.
    :raises Exception: if the extension is neither ``npy`` nor ``pkl``.
    """
    suffix_ = suffix(fp)

    if suffix_ == "npy":
        return np.load(fp)
    if suffix_ == "pkl":
        # SECURITY: unpickling can execute arbitrary code; only load
        # template files that come from a trusted source.
        with open(fp, "rb") as f:  # close the handle deterministically
            return pkl.load(f)
    # Report the offending extension (not the ``suffix`` function object).
    raise Exception(f"Unknown type: {suffix_}")
404
+
405
+
406
def dump(wfp, obj):
    """Write ``obj`` to ``wfp``, choosing the writer from the file extension.

    The parent directory of ``wfp`` is created first when it does not exist.

    :param wfp: destination path ending in ``.npy`` (``np.save``) or ``.pkl``
        (pickle).
    :param obj: object to serialise.
    :raises Exception: if the extension is neither ``npy`` nor ``pkl``.
    """
    wd = osp.split(wfp)[0]
    if wd != "" and not osp.exists(wd):
        mkdir(wd)

    _suffix = suffix(wfp)
    if _suffix == "npy":
        np.save(wfp, obj)
    elif _suffix == "pkl":
        # Use a context manager so the file is flushed and closed even if
        # pickling raises part-way through.
        with open(wfp, "wb") as f:
            pkl.dump(obj, f)
    else:
        raise Exception("Unknown type: {}".format(_suffix))
418
+
419
+
420
def make_abs_path(fn):
    """Join ``fn`` onto the directory that contains this source file.

    The location of this file is resolved with ``realpath`` first.
    """
    this_file = osp.realpath(__file__)
    return osp.join(osp.dirname(this_file), fn)
422
+
423
+
424
def load_image_rgb(image_path: str):
    """Read an image file with OpenCV and return it in RGB channel order.

    :param image_path: path of the image to read.
    :return: the decoded image converted from BGR to RGB.
    :raises FileNotFoundError: if ``image_path`` does not exist.
    :raises ValueError: if the file exists but OpenCV cannot decode it.
    """
    if not osp.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports an unreadable / unsupported file by returning
        # None; fail here with a clear message instead of inside cvtColor.
        raise ValueError(f"Failed to decode image: {image_path}")
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
429
+
430
+
431
def resize_to_limit(img: np.ndarray, max_dim=1920, division=2):
    """
    Adjust the size of the image so that its largest side does not exceed
    ``max_dim`` and its width and height are multiples of ``division``.

    :param img: the image to be processed (height and width are the first
        two axes).
    :param max_dim: the maximum dimension constraint; a value <= 0 disables
        the down-scaling step.
    :param division: the number the width and height must be multiples of;
        values below 1 are treated as 1.
    :return: the adjusted image.
    """
    h, w = img.shape[:2]

    # adjust the size of the image according to the maximum dimension
    if max_dim > 0 and max(h, w) > max_dim:
        if h > w:
            new_h = max_dim
            new_w = int(w * (max_dim / h))
        else:
            new_w = max_dim
            new_h = int(h * (max_dim / w))
        # For extreme aspect ratios the short side can truncate to 0, which
        # cv2.resize rejects; keep at least one pixel.
        img = cv2.resize(img, (max(new_w, 1), max(new_h, 1)))

    # ensure that the image dimensions are multiples of `division`
    division = max(division, 1)
    new_h = img.shape[0] - (img.shape[0] % division)
    new_w = img.shape[1] - (img.shape[1] % division)

    if new_h == 0 or new_w == 0:
        # when the width or height is less than `division`, no need to process
        return img

    if new_h != img.shape[0] or new_w != img.shape[1]:
        # crop from the bottom/right edge down to the nearest multiple
        img = img[:new_h, :new_w]

    return img
464
+
465
+
466
def preprocess(input_data):
    """Load the image at path ``input_data`` as RGB and apply the default size limits.

    :return: a one-element list holding the processed image.
    """
    return [resize_to_limit(load_image_rgb(input_data))]
470
+
471
+
472
def postprocess(output_data):
    """Placeholder post-processing hook: returns ``output_data`` unchanged.

    Add format conversions here if the raw model output needs them.
    """
    return output_data
476
+
477
+
478
def infer(model, input_data):
    """Run ``model`` on the image at ``input_data`` and post-process the result.

    Only the first declared input and the first declared output of the
    session are used.
    """
    first_input = model.get_inputs()[0].name
    first_output = model.get_outputs()[0].name
    feed = {first_input: preprocess(input_data)}  # rgb, resize & limit
    raw_result = model.run([first_output], feed)
    return postprocess(raw_result)
484
+
485
+
486
def partial_fields(target_class, kwargs):
    """Instantiate ``target_class`` with only the entries of ``kwargs`` it knows.

    A key is kept when ``target_class`` has an attribute of that name; all
    other keys are dropped before the constructor is called.
    """
    accepted = {}
    for field_name, field_value in kwargs.items():
        if hasattr(target_class, field_name):
            accepted[field_name] = field_value
    return target_class(**accepted)
488
+
489
+
490
def calc_ratio(lmk_lst):
    """Compute eye- and lip-close ratios for every landmark set in ``lmk_lst``.

    Each landmark array gets a leading axis (``lmk[None]``) before it is
    handed to ``calc_eye_close_ratio`` / ``calc_lip_close_ratio``.

    :return: ``(eye_ratio_list, lip_ratio_list)``, one entry per input item.
    """
    eye_ratios, lip_ratios = [], []
    for lmk in lmk_lst:
        eye_ratios.append(calc_eye_close_ratio(lmk[None]))  # for eyes retargeting
        lip_ratios.append(calc_lip_close_ratio(lmk[None]))  # for lip retargeting
    return eye_ratios, lip_ratios
499
+
500
+
501
def prepare_videos(imgs) -> torch.Tensor:
    """Turn driving frames into a normalised float tensor on the CPU.

    imgs: either a list of HxWx3 frames (stacked to TxHxWx3 and given a
    trailing singleton axis) or an ndarray that already has five axes laid
    out as TxHxWx3x1.  Pixel values are divided by 255 and clipped to 0~1.

    :return: tensor of shape Tx1x3xHxW.
    :raises ValueError: if ``imgs`` is neither a list nor an ndarray.
    """
    device = "cpu"
    if not isinstance(imgs, (list, np.ndarray)):
        raise ValueError(f'imgs type error: {type(imgs)}')

    # a list is stacked and extended to TxHxWx3x1; an ndarray is used as given
    frames = np.array(imgs)[..., np.newaxis] if isinstance(imgs, list) else imgs

    normalised = np.clip(frames.astype(np.float32) / 255., 0, 1)  # clip to 0~1
    tensor = torch.from_numpy(normalised).permute(0, 4, 3, 1, 2)  # TxHxWx3x1 -> Tx1x3xHxW
    return tensor.to(device)
519
+
520
+
521
def get_kp_info(x: torch.Tensor) -> dict:
    """Run the motion extractor and collect the implicit keypoint information.

    x: Bx3xHxW, normalized to 0~1
    return: a dict with keys 'pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp',
    filled from the first seven session outputs in that order.  The three
    pose entries are converted to degrees with shape Bx1, and 'kp' / 'exp'
    are reshaped to BxNx3.
    """
    # TODO: the run() input arguments in axengine still differ slightly from ort
    outs = motion_extractor.run([], input_feed={"input": x.numpy()})

    output_keys = ('pitch', 'yaw', 'roll', 't', 'exp', 'scale', 'kp')
    kp_info = {key: torch.from_numpy(outs[idx]) for idx, key in enumerate(output_keys)}

    # whether to transform the pose to degrees and reshape the keypoints
    flag_refine_info: bool = True
    if flag_refine_info:
        bs = kp_info['kp'].shape[0]
        for angle_key in ('pitch', 'yaw', 'roll'):
            kp_info[angle_key] = headpose_pred_to_degree(kp_info[angle_key])[:, None]  # Bx1
        for points_key in ('kp', 'exp'):
            kp_info[points_key] = kp_info[points_key].reshape(bs, -1, 3)  # BxNx3

    return kp_info
547
+
548
+
549
def transform_keypoint(kp_info: dict):
    """
    Transform the implicit keypoints with the pose, shift, and expression
    deformation: ``s * (x_c @ R + exp) + t`` (only tx, ty are applied).

    kp_info: dict with 'kp' (BxNx3 or Bx(N*3)), 'pitch', 'yaw', 'roll',
    't', 'exp' and 'scale'.
    return: transformed keypoints, BxNx3
    """
    kp = kp_info['kp']  # (bs, k, 3)
    pitch = headpose_pred_to_degree(kp_info['pitch'])
    yaw = headpose_pred_to_degree(kp_info['yaw'])
    roll = headpose_pred_to_degree(kp_info['roll'])
    t, exp, scale = kp_info['t'], kp_info['exp'], kp_info['scale']

    bs = kp.shape[0]
    # flat layout Bx(num_kp*3) vs. structured layout Bxnum_kpx3
    num_kp = kp.shape[1] // 3 if kp.ndim == 2 else kp.shape[1]

    # (bs, 3, 3), convert the Euler angles to a rotation matrix
    rot_mat = get_rotation_matrix(pitch, yaw, roll)

    # Eqn.2: s * (R * x_c,s + exp) + t
    kp_transformed = kp.view(bs, num_kp, 3) @ rot_mat + exp.view(bs, num_kp, 3)
    kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
    kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty

    return kp_transformed
577
+
578
+
579
def make_motion_template(I_lst, c_eyes_lst, c_lip_lst, **kwargs):
    """Build the driving motion template for a sequence of frames.

    :param I_lst: frame tensor indexed along its first axis; each ``I_lst[i]``
        is passed to ``get_kp_info``.
    :param c_eyes_lst: per-frame eye-close ratios (arrays supporting ``astype``).
    :param c_lip_lst: per-frame lip-close ratios (arrays supporting ``astype``).
    :param kwargs: optional ``output_fps`` (defaults to 25).
    :return: dict with 'n_frames', 'output_fps', 'motion', 'c_eyes_lst' and
        'c_lip_lst'; all stored arrays are float32 numpy arrays.
    """
    def _as_float32_numpy(tensor):
        return tensor.cpu().numpy().astype(np.float32)

    n_frames = I_lst.shape[0]
    motion_items, eye_ratios, lip_ratios = [], [], []
    template_dct = {
        'n_frames': n_frames,
        'output_fps': kwargs.get('output_fps', 25),
        'motion': motion_items,
        'c_eyes_lst': eye_ratios,
        'c_lip_lst': lip_ratios,
    }

    for frame_idx in range(n_frames):
        # collect s, R, δ and t for inference
        frame_info = get_kp_info(I_lst[frame_idx])
        x_s = transform_keypoint(frame_info)
        R_i = get_rotation_matrix(frame_info['pitch'], frame_info['yaw'], frame_info['roll'])

        motion_items.append({
            'scale': _as_float32_numpy(frame_info['scale']),
            'R': _as_float32_numpy(R_i),
            'exp': _as_float32_numpy(frame_info['exp']),
            't': _as_float32_numpy(frame_info['t']),
            'kp': _as_float32_numpy(frame_info['kp']),
            'x_s': _as_float32_numpy(x_s),
        })
        eye_ratios.append(c_eyes_lst[frame_idx].astype(np.float32))
        lip_ratios.append(c_lip_lst[frame_idx].astype(np.float32))

    return template_dct
614
+
615
+
616
def prepare_source(img: np.ndarray) -> torch.Tensor:
    """Turn a source image (or batch) into a normalised float tensor on the CPU.

    img: HxWx3 or BxHxWx3, uint8 (the pipeline passes a 256x256 crop).
    The input array is not modified.

    :return: tensor of shape 1x3xHxW (or Bx3xHxW), values in 0~1.
    :raises ValueError: if ``img`` has neither 3 nor 4 dimensions.
    """
    device = "cpu"

    # ``astype`` already returns a new array, so no explicit copy is needed.
    if img.ndim == 3:
        x = img[np.newaxis].astype(np.float32) / 255.  # HxWx3 -> 1xHxWx3, normalized to 0~1
    elif img.ndim == 4:
        x = img.astype(np.float32) / 255.  # BxHxWx3, normalized to 0~1
    else:
        raise ValueError(f'img ndim should be 3 or 4: {img.ndim}')
    x = np.clip(x, 0, 1)  # clip to 0~1
    x = torch.from_numpy(x).permute(0, 3, 1, 2)  # 1xHxWx3 -> 1x3xHxW
    return x.to(device)
634
+
635
+
636
def extract_feature_3d(x: torch.Tensor) -> torch.Tensor:
    """Get the appearance feature of the image from the feature-extractor session.

    x: Bx3xHxW, normalized to 0~1
    return: the session's first output wrapped as a tensor.
    """
    feed = {"input": x.numpy()}
    feature_volume = appearance_feature_extractor.run([], input_feed=feed)[0]
    return torch.from_numpy(feature_volume)
642
+
643
+
644
def stitch(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
    """Query the stitching network for the keypoint correction.

    kp_source: BxNx3
    kp_driving: BxNx3
    Return: Bx(3*num_kp+2), the first output of the stitching session.
    """
    stitch_input = concat_feat(kp_source, kp_driving)
    feed = {"input": stitch_input.numpy()}
    delta = stitching_retargeting_module.run([], input_feed=feed)[0]
    return torch.from_numpy(delta)
653
+
654
+
655
def stitching(kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
    """Conduct the stitching and return corrected driving keypoints.

    kp_source: Bxnum_kpx3
    kp_driving: Bxnum_kpx3 (left untouched; a clone is modified and returned)
    """
    bs, num_kp = kp_source.shape[:2]

    stitched = kp_driving.clone()
    delta = stitch(kp_source, stitched)

    # delta layout: [3*num_kp expression offsets | tx, ty]
    split_at = 3 * num_kp
    delta_exp = delta[..., :split_at].reshape(bs, num_kp, 3)  # 1x20x3
    delta_tx_ty = delta[..., split_at:split_at + 2].reshape(bs, 1, 2)  # 1x1x2

    stitched += delta_exp
    stitched[..., :2] += delta_tx_ty

    return stitched
673
+
674
+
675
def warp_decode(feature_3d: torch.Tensor, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> dict:
    """Warp the feature volume with the implicit keypoints and decode an image.

    feature_3d: Bx32x16x64x64, feature volume
    kp_source: BxNx3
    kp_driving: BxNx3

    :return: a dict whose only key, 'out', holds the generator output as a
        tensor (the function returns a dict, not a bare tensor).
    """
    warp_feed = {
        "feature_3d": feature_3d.numpy(),
        "kp_driving": kp_driving.numpy(),
        "kp_source": kp_source.numpy(),
    }
    # Only the third output of the warping session is passed to the generator.
    warped = warping_module.run([], warp_feed)[2]
    generated = spade_generator.run([], input_feed={"input": warped})[0]
    return {'out': torch.from_numpy(generated)}
686
+
687
+
688
def parse_output(out: torch.Tensor) -> np.ndarray:
    """Convert the generator output into displayable image data.

    out: 1x3xHxW tensor with values nominally in 0~1
    return: 1xHxWx3, uint8
    """
    channels_last = out.data.cpu().numpy().transpose(0, 2, 3, 1)  # 1x3xHxW -> 1xHxWx3
    unit_range = np.clip(channels_last, 0, 1)  # clip to 0~1
    return np.clip(unit_range * 255, 0, 255).astype(np.uint8)  # 0~1 -> 0~255
697
+
698
+
699
def load_model(model_type, model_path=None):
    """Create a CPU ONNX Runtime session for one of the pipeline's networks.

    :param model_type: one of 'appearance_feature_extractor',
        'motion_extractor', 'warping_module', 'spade_generator',
        'stitching_retargeting_module'.
    :param model_path: directory that contains the ``.onnx`` files; it is
        interpolated into the path as-is.
    :return: the ``ort.InferenceSession`` for the requested network.
    :raises ValueError: if ``model_type`` is not one of the known names.
    """
    model_files = {
        'appearance_feature_extractor': 'feature_extractor.onnx',
        'motion_extractor': 'motion_extractor.onnx',
        'warping_module': 'warp.onnx',
        'spade_generator': 'spade_generator.onnx',
        'stitching_retargeting_module': 'stitching_retargeting.onnx',
    }
    if model_type not in model_files:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(
            f"Unknown model_type: {model_type!r}; expected one of {sorted(model_files)}"
        )
    return ort.InferenceSession(
        f"{model_path}/{model_files[model_type]}",
        providers=["CPUExecutionProvider"],
    )
711
+
712
+
713
+ def main():
714
+ args = parse_args()
715
+
716
+ global appearance_feature_extractor
717
+ appearance_feature_extractor = load_model("appearance_feature_extractor", args.models)
718
+
719
+ global motion_extractor
720
+ motion_extractor = load_model("motion_extractor", args.models)
721
+
722
+ global warping_module
723
+ warping_module = load_model("warping_module", args.models)
724
+
725
+ global spade_generator
726
+ spade_generator = load_model("spade_generator", args.models)
727
+
728
+ global stitching_retargeting_module
729
+ stitching_retargeting_module = load_model("stitching_retargeting_module", args.models)
730
+
731
+ source = args.source
732
+ driving = args.driving
733
+
734
+ ffmpeg_dir = os.path.join(os.getcwd(), "ffmpeg")
735
+ if osp.exists(ffmpeg_dir):
736
+ os.environ["PATH"] += (os.pathsep + ffmpeg_dir)
737
+
738
+ if not fast_check_ffmpeg():
739
+ raise ImportError(
740
+ "FFmpeg is not installed. Please install FFmpeg (including ffmpeg and ffprobe) before running this script. https://ffmpeg.org/download.html"
741
+ )
742
+
743
+ source_rgb_lst = preprocess(source) # rgb, resize & limit
744
+ ######## process driving info ########
745
+ flag_load_from_template = is_template(args.driving)
746
+ driving_rgb_crop_256x256_lst = None
747
+ wfp_template = None
748
+ device = "cpu"
749
+ flag_is_source_video = False
750
+ cropper: Cropper = Cropper()
751
+
752
+ if flag_load_from_template:
753
+ # NOTE: load from template, it is fast, but the cropping video is None
754
+ logger.info(f"Load from template: {args.driving}, NOT the video, so the cropping video and audio are both NULL.", style='bold green')
755
+ driving_template_dct = load(args.driving)
756
+ c_d_eyes_lst = driving_template_dct['c_eyes_lst'] if 'c_eyes_lst' in driving_template_dct.keys() else driving_template_dct['c_d_eyes_lst'] # compatible with previous keys
757
+ c_d_lip_lst = driving_template_dct['c_lip_lst'] if 'c_lip_lst' in driving_template_dct.keys() else driving_template_dct['c_d_lip_lst']
758
+ driving_n_frames = driving_template_dct['n_frames']
759
+ flag_is_driving_video = True if driving_n_frames > 1 else False
760
+ if flag_is_source_video and flag_is_driving_video:
761
+ n_frames = min(len(source_rgb_lst), driving_n_frames) # minimum number as the number of the animated frames
762
+ elif flag_is_source_video and not flag_is_driving_video:
763
+ n_frames = len(source_rgb_lst)
764
+ else:
765
+ n_frames = driving_n_frames
766
+ # set output_fps
767
+ output_fps = driving_template_dct.get('output_fps', 25)
768
+ logger.info(f'The FPS of template: {output_fps}')
769
+ flag_crop_driving_video = False
770
+ if flag_crop_driving_video:
771
+ logger.info("Warning: flag_crop_driving_video is True, but the driving info is a template, so it is ignored.")
772
+ elif osp.exists(args.driving):
773
+ if is_video(args.driving):
774
+ flag_is_driving_video = True
775
+ # load from video file, AND make motion template
776
+ output_fps = int(get_fps(args.driving))
777
+ driving_rgb_lst = load_video(args.driving)
778
+ elif is_image(args.driving):
779
+ flag_is_driving_video = False
780
+ output_fps = 25
781
+ driving_rgb_lst = [load_image_rgb(driving)] # rgb
782
+ else:
783
+ raise Exception(f"{args.driving} is not a supported type!")
784
+ ######## make motion template ########
785
+ logger.info("Start making driving motion template...")
786
+ driving_n_frames = len(driving_rgb_lst)
787
+ n_frames = driving_n_frames
788
+ driving_lmk_crop_lst = cropper.calc_lmks_from_cropped_video(driving_rgb_lst) # cropper.
789
+ driving_rgb_crop_256x256_lst = [cv2.resize(_, (256, 256)) for _ in driving_rgb_lst] # force to resize to 256x256
790
+ #######################################
791
+ c_d_eyes_lst, c_d_lip_lst = calc_ratio(driving_lmk_crop_lst)
792
+ # save the motion template
793
+ I_d_lst = prepare_videos(driving_rgb_crop_256x256_lst)
794
+
795
+ driving_template_dct = make_motion_template(I_d_lst, c_d_eyes_lst, c_d_lip_lst, output_fps=output_fps)
796
+ wfp_template = remove_suffix(args.driving) + '.pkl'
797
+ dump(wfp_template, driving_template_dct)
798
+ logger.info(f"Dump motion template to {wfp_template}")
799
+ else:
800
+ raise Exception(f"{args.driving} does not exist!")
801
+
802
+ if not flag_is_driving_video:
803
+ c_d_eyes_lst = c_d_eyes_lst * n_frames
804
+ c_d_lip_lst = c_d_lip_lst * n_frames
805
+
806
+ I_p_pstbk_lst = []
807
+ logger.info("Prepared pasteback mask done.")
808
+
809
+ I_p_lst = []
810
+ R_d_0, x_d_0_info = None, None
811
+ flag_normalize_lip = False # inf_cfg.flag_normalize_lip # not overwrite
812
+ flag_source_video_eye_retargeting = False # inf_cfg.flag_source_video_eye_retargeting # not overwrite
813
+ lip_delta_before_animation, eye_delta_before_animation = None, None
814
+
815
+ ######## process source info ########
816
+ # if the input is a source image, process it only once
817
+ flag_do_crop = True
818
+ if flag_do_crop:
819
+ crop_info = cropper.crop_source_image(source_rgb_lst[0])
820
+ if crop_info is None:
821
+ raise Exception("No face detected in the source image!")
822
+ source_lmk = crop_info['lmk_crop']
823
+ img_crop_256x256 = crop_info['img_crop_256x256']
824
+ else:
825
+ source_lmk = cropper.calc_lmk_from_cropped_image(source_rgb_lst[0])
826
+ img_crop_256x256 = cv2.resize(source_rgb_lst[0], (256, 256)) # force to resize to 256x256
827
+
828
+ I_s = prepare_source(img_crop_256x256)
829
+ x_s_info = get_kp_info(I_s)
830
+ x_c_s = x_s_info['kp']
831
+ R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
832
+ f_s = extract_feature_3d(I_s)
833
+ x_s = transform_keypoint(x_s_info)
834
+
835
+ # let lip-open scalar to be 0 at first
836
+ mask_crop: ndarray = cv2.imread(make_abs_path('./utils/resources/mask_template.png'), cv2.IMREAD_COLOR)
837
+ mask_ori_float = prepare_paste_back(mask_crop, crop_info['M_c2o'], dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0]))
838
+
839
+ with open(make_abs_path('./utils/resources/lip_array.pkl'), 'rb') as f:
840
+ lip_array = pkl.load(f)
841
+ ######## animate ########
842
+ if flag_is_driving_video: # or (flag_is_source_video and not flag_is_driving_video)
843
+ logger.info(f"The animated video consists of {n_frames} frames.")
844
+ else:
845
+ logger.info(f"The output of image-driven portrait animation is an image.")
846
+ for i in range(n_frames):
847
+ x_d_i_info = driving_template_dct['motion'][i]
848
+ x_d_i_info = dct2device(x_d_i_info, device)
849
+ R_d_i = x_d_i_info['R'] if 'R' in x_d_i_info.keys() else x_d_i_info['R_d'] # compatible with previous keys
850
+
851
+ if i == 0: # cache the first frame
852
+ R_d_0 = R_d_i
853
+ x_d_0_info = x_d_i_info.copy()
854
+
855
+ delta_new = x_s_info['exp'].clone()
856
+ R_new = x_d_r_lst_smooth[i] if flag_is_source_video else (R_d_i @ R_d_0.permute(0, 2, 1)) @ R_s
857
+ if flag_is_driving_video:
858
+ delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - x_d_0_info['exp'])
859
+ else:
860
+ delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(lip_array).to(dtype=torch.float32, device=device))
861
+ # delta_new = x_s_info['exp'] + (x_d_i_info['exp'] - torch.from_numpy(lip_array).to(dtype=torch.float32, device=device))
862
+ scale_new = x_s_info['scale'] if flag_is_source_video else x_s_info['scale'] * (x_d_i_info['scale'] / x_d_0_info['scale'])
863
+ t_new = x_s_info['t'] if flag_is_source_video else x_s_info['t'] + (x_d_i_info['t'] - x_d_0_info['t'])
864
+ t_new[..., 2].fill_(0) # zero tz
865
+ x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new
866
+
867
+ if i == 0 and flag_is_driving_video:
868
+ x_d_0_new = x_d_i_new
869
+ motion_multiplier = calc_motion_multiplier(x_s, x_d_0_new)
870
+ # motion_multiplier *= inf_cfg.driving_multiplier
871
+ x_d_diff = (x_d_i_new - x_d_0_new) * motion_multiplier
872
+ x_d_i_new = x_d_diff + x_s
873
+
874
+ # Algorithm 1:
875
+ # with stitching and without retargeting
876
+ x_d_i_new = stitching(x_s, x_d_i_new)
877
+ x_d_i_new = x_s + (x_d_i_new - x_s) * 1.0
878
+ out = warp_decode(f_s, x_s, x_d_i_new)
879
+ I_p_i = parse_output(out['out'])[0]
880
+ I_p_lst.append(I_p_i)
881
+ I_p_pstbk = paste_back(I_p_i, crop_info['M_c2o'], source_rgb_lst[0], mask_ori_float)
882
+ I_p_pstbk_lst.append(I_p_pstbk)
883
+
884
+ mkdir(args.output_dir)
885
+ wfp_concat = None
886
+ ######### build the final concatenation result #########
887
+ # driving frame | source frame | generation
888
+ frames_concatenated = concat_frames(driving_rgb_crop_256x256_lst, [img_crop_256x256], I_p_lst)
889
+
890
+ if flag_is_driving_video or (flag_is_source_video and not flag_is_driving_video):
891
+ flag_source_has_audio = flag_is_source_video and has_audio_stream(args.source)
892
+ flag_driving_has_audio = has_audio_stream(args.driving)
893
+
894
+ wfp_concat = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat.mp4')
895
+
896
+ # NOTE: update output fps
897
+ output_fps = source_fps if flag_is_source_video else output_fps
898
+ images2video(frames_concatenated, wfp=wfp_concat, fps=output_fps)
899
+
900
+ if flag_source_has_audio or flag_driving_has_audio:
901
+ # final result with concatenation
902
+ wfp_concat_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_concat_with_audio.mp4')
903
+ audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
904
+ logger.info(f"Audio is selected from {audio_from_which_video}, concat mode")
905
+ add_audio_to_video(wfp_concat, audio_from_which_video, wfp_concat_with_audio)
906
+ os.replace(wfp_concat_with_audio, wfp_concat)
907
+ logger.info(f"Replace {wfp_concat_with_audio} with {wfp_concat}")
908
+
909
+ # save the animated result
910
+ wfp = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}.mp4')
911
+ if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
912
+ images2video(I_p_pstbk_lst, wfp=wfp, fps=output_fps)
913
+ else:
914
+ images2video(I_p_lst, wfp=wfp, fps=output_fps)
915
+
916
+ ######### build the final result #########
917
+ if flag_source_has_audio or flag_driving_has_audio:
918
+ wfp_with_audio = osp.join(args.output_dir, f'{basename(args.source)}--{basename(args.driving)}_with_audio.mp4')
919
+ audio_from_which_video = args.driving if ((flag_driving_has_audio and args.audio_priority == 'driving') or (not flag_source_has_audio)) else args.source
920
+ logger.info(f"Audio is selected from {audio_from_which_video}")
921
+ add_audio_to_video(wfp, audio_from_which_video, wfp_with_audio)
922
+ os.replace(wfp_with_audio, wfp)
923
+ logger.info(f"Replace {wfp_with_audio} with {wfp}")
924
+
925
+ # final log
926
+ if wfp_template not in (None, ''):
927
+ logger.info(f'Animated template: {wfp_template}, you can specify `-d` argument with this template path next time to avoid cropping video, motion making and protecting privacy.', style='bold green')
928
+ logger.info(f'Animated video: {wfp}')
929
+ logger.info(f'Animated video with concat: {wfp_concat}')
930
+ else:
931
+ wfp_concat = osp.join(args.output_dir, f'{basename(source)}--{basename(driving)}_concat.jpg')
932
+ cv2.imwrite(wfp_concat, frames_concatenated[0][..., ::-1])
933
+ wfp = osp.join(args.output_dir, f'{basename(source)}--{basename(driving)}.jpg')
934
+ if I_p_pstbk_lst is not None and len(I_p_pstbk_lst) > 0:
935
+ cv2.imwrite(wfp, I_p_pstbk_lst[0][..., ::-1])
936
+ else:
937
+ cv2.imwrite(wfp, frames_concatenated[0][..., ::-1])
938
+ # final log
939
+ logger.info(f'Animated image: {wfp}')
940
+ logger.info(f'Animated image with concat: {wfp_concat}')
941
+
942
+
943
# Script entry point: run the whole pipeline once and log the wall time.
if __name__ == "__main__":
    """
    Usage:
        python3 infer_onnx.py --source ../assets/examples/source/s0.jpg --driving ../assets/examples/driving/d8.jpg --models onnx-models --output-dir output
    """
    # The timer brackets main(), so model loading is included in the figure.
    timer = Timer()
    timer.tic()
    main()
    # NOTE(review): toc() presumably returns elapsed seconds, since the log
    # line below formats it with an "s" suffix -- confirm against Timer.
    elapse = timer.toc()
    logger.debug(f'LivePortrait onnx infer time: {elapse:.3f}s')
python/requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ onnx
2
+ onnxruntime
3
+ opencv-python
4
+ torch
5
+ torchvision
6
+ numpy
7
+ loguru
8
+ imageio[ffmpeg]
9
+ ffprobe-python
python/utils/__init__.py ADDED
File without changes
python/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (156 Bytes). View file
 
python/utils/__pycache__/crop.cpython-310.pyc ADDED
Binary file (10 kB). View file
 
python/utils/__pycache__/human_landmark_runner.cpython-310.pyc ADDED
Binary file (2.87 kB). View file
 
python/utils/__pycache__/rprint.cpython-310.pyc ADDED
Binary file (368 Bytes). View file
 
python/utils/__pycache__/timer.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
python/utils/crop.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+
3
+ """
4
+ cropping function and the related preprocess functions for cropping
5
+ """
6
+
7
+ import numpy as np
8
+ import os.path as osp
9
+ from math import sin, cos, acos, degrees
10
+ import cv2; cv2.setNumThreads(0); cv2.ocl.setUseOpenCL(False) # NOTE: enforce single thread
11
+ from .rprint import rprint as print
12
+
13
+ DTYPE = np.float32
14
+ CV2_INTERP = cv2.INTER_LINEAR
15
+
16
def make_abs_path(fn):
    """Join ``fn`` onto the directory that contains this source file.

    The location of this file is resolved with ``realpath`` first.
    """
    module_file = osp.realpath(__file__)
    module_dir = osp.dirname(module_file)
    return osp.join(module_dir, fn)
18
+
19
def _transform_img(img, M, dsize, flags=CV2_INTERP, borderMode=None):
    """Apply a similarity or affine transformation to an image with ``cv2.warpAffine``.

    img: image to warp
    M: 2x3 matrix or 3x3 matrix (only the first two rows are used)
    dsize: target shape (width, height); a single number means a square
    flags: interpolation flags passed to OpenCV
    borderMode: when given, it is forwarded together with a black border
        value; when None, OpenCV's default border handling is used
    """
    target_size = tuple(dsize) if isinstance(dsize, (tuple, list)) else (dsize, dsize)

    warp_options = {"dsize": target_size, "flags": flags}
    if borderMode is not None:
        warp_options["borderMode"] = borderMode
        warp_options["borderValue"] = (0, 0, 0)

    return cv2.warpAffine(img, M[:2, :], **warp_options)
34
+
35
+
36
+ def _transform_pts(pts, M):
37
+ """ conduct similarity or affine transformation to the pts
38
+ pts: Nx2 ndarray
39
+ M: 2x3 matrix or 3x3 matrix
40
+ return: Nx2
41
+ """
42
+ return pts @ M[:2, :2].T + M[:2, 2]
43
+
44
+
45
def parse_pt2_from_pt101(pt101, use_lip=True):
    """
    Reduce 101 landmarks to the 2 reference points used to cancel the roll.

    With ``use_lip`` the points are (eye midpoint, lip midpoint); otherwise
    they are (left eye center, right eye center).
    """
    # the former version used the eye center, but it is not robust; the eye
    # centers are now interpolated from four landmarks per eye
    left_eye = np.mean(pt101[[39, 42, 45, 48]], axis=0)
    right_eye = np.mean(pt101[[51, 54, 57, 60]], axis=0)

    if not use_lip:
        return np.stack([left_eye, right_eye], axis=0)

    eye_mid = (left_eye + right_eye) / 2
    lip_mid = (pt101[75] + pt101[81]) / 2
    return np.stack([eye_mid, lip_mid], axis=0)
61
+
62
+
63
def parse_pt2_from_pt106(pt106, use_lip=True):
    """
    Reduce 106 landmarks to the 2 reference points used to cancel the roll.

    With ``use_lip`` the points are (eye midpoint, lip midpoint); otherwise
    they are (left eye center, right eye center).
    """
    left_eye = np.mean(pt106[[33, 35, 40, 39]], axis=0)
    right_eye = np.mean(pt106[[87, 89, 94, 93]], axis=0)

    if not use_lip:
        return np.stack([left_eye, right_eye], axis=0)

    eye_mid = (left_eye + right_eye) / 2
    lip_mid = (pt106[52] + pt106[61]) / 2
    return np.stack([eye_mid, lip_mid], axis=0)
78
+
79
+
80
def parse_pt2_from_pt203(pt203, use_lip=True):
    """
    Reduce 203 landmarks to the 2 reference points used to cancel the roll.

    With ``use_lip`` the points are (eye midpoint, lip midpoint); otherwise
    they are (left eye center, right eye center).
    """
    left_eye = np.mean(pt203[[0, 6, 12, 18]], axis=0)
    right_eye = np.mean(pt203[[24, 30, 36, 42]], axis=0)

    if not use_lip:
        return np.stack([left_eye, right_eye], axis=0)

    eye_mid = (left_eye + right_eye) / 2
    lip_mid = (pt203[48] + pt203[66]) / 2
    return np.stack([eye_mid, lip_mid], axis=0)
94
+
95
+
96
def parse_pt2_from_pt68(pt68, use_lip=True):
    """
    Reduce 68 landmarks to the 2 reference points used to cancel the roll.

    With ``use_lip`` the points are (eye midpoint, midpoint of the two lip
    landmarks); otherwise they are (left eye, right eye).
    """
    # 1-based landmark ids converted to 0-based indices:
    # [nose, eye, eye, eye, eye, lip, lip]
    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55], dtype=np.int32) - 1

    left_eye = np.mean(pt68[lm_idx[[1, 2]], :], 0)
    right_eye = np.mean(pt68[lm_idx[[3, 4]], :], 0)

    if not use_lip:
        return np.stack([left_eye, right_eye], axis=0)

    lip_first = pt68[lm_idx[5], :]
    lip_second = pt68[lm_idx[6], :]
    return np.stack([
        (left_eye + right_eye) / 2,
        (lip_first + lip_second) / 2,
    ], axis=0)
121
+
122
+
123
def parse_pt2_from_pt5(pt5, use_lip=True):
    """
    Reduce 5 landmarks to the 2 reference points used to cancel the roll.

    With ``use_lip`` the points are (midpoint of points 0 and 1, midpoint of
    points 3 and 4); otherwise points 0 and 1 are returned as they are.
    """
    if not use_lip:
        return np.stack([pt5[0], pt5[1]], axis=0)

    upper_mid = (pt5[0] + pt5[1]) / 2
    lower_mid = (pt5[3] + pt5[4]) / 2
    return np.stack([upper_mid, lower_mid], axis=0)
138
+
139
def parse_pt2_from_pt9(pt9, use_lip=True):
    '''
    Reduce 9 landmarks to the 2 reference points used to cancel the roll.

    Landmark order:
    ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip']

    With ``use_lip`` the points are (eye midpoint, lip-corner midpoint);
    otherwise they are (left eye, right eye).
    '''
    left_eye = (pt9[2] + pt9[3]) / 2
    right_eye = (pt9[0] + pt9[1]) / 2

    if not use_lip:
        return np.stack([left_eye, right_eye], axis=0)

    lip_mid = (pt9[5] + pt9[6]) / 2
    eye_mid = (left_eye + right_eye) / 2
    return np.stack([eye_mid, lip_mid], axis=0)
162
+
163
def parse_pt2_from_pt_x(pts, use_lip=True):
    """Pick the landmark-specific parser from the number of points in ``pts``.

    Supported counts are 101, 106, 68, 5, 203 and 9; any other count above
    101 is handled by taking the first 101 points.  When ``use_lip`` is
    False the second returned point is replaced so that the vector from the
    first to the second point is ``(-v_y, v_x)`` of the original one.

    :raises Exception: if the number of points is not supported.
    """
    exact_parsers = {
        101: parse_pt2_from_pt101,
        106: parse_pt2_from_pt106,
        68: parse_pt2_from_pt68,
        5: parse_pt2_from_pt5,
        203: parse_pt2_from_pt203,
        9: parse_pt2_from_pt9,
    }
    n_points = pts.shape[0]

    if n_points in exact_parsers:
        pt2 = exact_parsers[n_points](pts, use_lip=use_lip)
    elif n_points > 101:
        # take the first 101 points
        pt2 = parse_pt2_from_pt101(pts[:101], use_lip=use_lip)
    else:
        raise Exception(f'Unknow shape: {pts.shape}')

    if not use_lip:
        # NOTE: to comply with the later code, pt2 needs to be rotated by 90
        # degrees manually
        v = pt2[1] - pt2[0]
        pt2[1, 0] = pt2[0, 0] - v[1]
        pt2[1, 1] = pt2[0, 1] + v[0]

    return pt2
189
+
190
+
191
def parse_rect_from_landmark(
    pts,
    scale=1.5,
    need_square=True,
    vx_ratio=0,
    vy_ratio=0,
    use_deg_flag=False,
    **kwargs
):
    """Parse center, size and angle of a rotated box from 101/68/5/x landmarks.

    pts: landmarks; the parser is chosen from ``pts.shape``
    scale: factor applied to the box size
    need_square: make both sides equal to the longer one before scaling
    vx_ratio: the offset ratio along the pupil axis x-axis, multiplied by size
    vy_ratio: the offset ratio along the pupil axis y-axis, multiplied by size,
        which is used to contain more forehead area
    use_deg_flag: return the angle in degrees instead of radians
    kwargs: ``use_lip`` (default True) is forwarded to the landmark parser

    return: (center, size, angle)
    """
    pt2 = parse_pt2_from_pt_x(pts, use_lip=kwargs.get('use_lip', True))

    # unit vector along the second reference axis (falls back to +y when the
    # two reference points nearly coincide)
    uy = pt2[1] - pt2[0]
    axis_len = np.linalg.norm(uy)
    if axis_len > 1e-3:
        uy /= axis_len
    else:
        uy = np.array([0, 1], dtype=DTYPE)
    ux = np.array((uy[1], -uy[0]), dtype=DTYPE)

    # the rotation degree of the x-axis, the clockwise is positive, the
    # counterclockwise is negative (image coordinate system)
    angle = acos(ux[0])
    if ux[1] < 0:
        angle = -angle

    # rotation matrix
    M = np.array([ux, uy])

    # calculate the size which contains the angle degree of the bbox, and the center
    center0 = np.mean(pts, axis=0)
    rotated = (pts - center0) @ M.T  # (M @ P.T).T = P @ M.T
    lt_pt = np.min(rotated, axis=0)
    rb_pt = np.max(rotated, axis=0)
    center1 = (lt_pt + rb_pt) / 2

    size = rb_pt - lt_pt
    if need_square:
        longest = max(size[0], size[1])
        size[0] = longest
        size[1] = longest

    size *= scale  # scale size

    # counterclockwise rotation, equivalent to M.T @ center1.T
    center = center0 + ux * center1[0] + uy * center1[1]
    # considering the offset in vx and vy direction
    center = center + ux * (vx_ratio * size) + uy * (vy_ratio * size)

    if use_deg_flag:
        angle = degrees(angle)

    return center, size, angle
248
+
249
+
250
def parse_bbox_from_landmark(pts, **kwargs):
    """Build the axis-aligned and rotated face boxes from landmarks.

    Args:
        pts: landmark array, one 2D point per row.
        **kwargs: forwarded unchanged to ``parse_rect_from_landmark``.

    Returns:
        dict with ``center``, ``size`` and ``angle`` exactly as returned by
        ``parse_rect_from_landmark``, plus ``bbox`` (4x2 corners before
        rotation) and ``bbox_rot`` (4x2 corners rotated about the center).
    """
    center, size, angle = parse_rect_from_landmark(pts, **kwargs)
    cx, cy = center
    w, h = size

    # parse_rect_from_landmark returns degrees when use_deg_flag is set; the
    # trigonometric functions below always need radians.
    angle_rad = np.deg2rad(angle) if kwargs.get('use_deg_flag', False) else angle

    # calculate the vertex positions before rotation
    bbox = np.array([
        [cx-w/2, cy-h/2],  # left, top
        [cx+w/2, cy-h/2],
        [cx+w/2, cy+h/2],  # right, bottom
        [cx-w/2, cy+h/2]
    ], dtype=DTYPE)

    # construct rotation matrix
    R = np.array([
        [np.cos(angle_rad), -np.sin(angle_rad)],
        [np.sin(angle_rad), np.cos(angle_rad)]
    ], dtype=DTYPE)

    # Take each vertex relative to the rotation center, rotate it, then add
    # the center back.
    bbox_rot = (bbox - center) @ R.T + center

    return {
        'center': center,  # 2-vector (cx, cy)
        'size': size,  # 2-vector (w, h)
        'angle': angle,  # same unit and sign as parse_rect_from_landmark returned
        'bbox': bbox,  # 4x2
        'bbox_rot': bbox_rot,  # 4x2
    }
280
+
281
+
282
def crop_image_by_bbox(img, bbox, lmk=None, dsize=512, angle=None, flag_rot=False, **kwargs):
    """Crop ``img`` to ``bbox`` and resample it to ``dsize``.

    Args:
        img: source image, passed to ``_transform_img``.
        bbox: ``(left, top, right, bottom)``; the width defines the scale.
        lmk: optional landmarks to map into the crop with the same transform.
        dsize: output size passed to ``_transform_img``.
        angle: rotation angle, used only when ``flag_rot`` is True.
        flag_rot: whether to include the rotation in the transform.
        **kwargs: only ``borderMode`` is read and forwarded.

    Returns:
        dict with ``img_crop``, ``lmk_crop`` (None when ``lmk`` is None) and
        the 3x3 matrices ``M_o2c`` (original -> crop) and ``M_c2o`` (inverse).
    """
    left, top, right, bot = bbox
    if int(right - left) != int(bot - top):
        print(f'right-left {right-left} != bot-top {bot-top}')
    size = right - left

    src_center = np.array([(left + right) / 2, (top + bot) / 2], dtype=DTYPE)
    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)

    s = dsize / size  # scale
    rotate = flag_rot and angle is not None
    if not rotate:
        # Pure scale + translation.
        rows = [
            [s, 0, tgt_center[0] - s * src_center[0]],
            [0, s, tgt_center[1] - s * src_center[1]],
        ]
    else:
        costheta, sintheta = cos(angle), sin(angle)
        cx, cy = src_center[0], src_center[1]  # original center
        tcx, tcy = tgt_center[0], tgt_center[1]  # target center
        rows = [
            [s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
            [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)],
        ]
    affine = np.array(rows, dtype=DTYPE)

    img_crop = _transform_img(img, affine, dsize=dsize, borderMode=kwargs.get('borderMode', None))
    lmk_crop = None if lmk is None else _transform_pts(lmk, affine)

    # Promote to homogeneous 3x3 so the transform can be inverted.
    M_o2c = np.vstack([affine, np.array([0, 0, 1], dtype=DTYPE)])
    M_c2o = np.linalg.inv(M_o2c)

    return {
        'img_crop': img_crop,
        'lmk_crop': lmk_crop,
        'M_o2c': M_o2c,
        'M_c2o': M_c2o,
    }
326
+
327
+
328
def _estimate_similar_transform_from_pts(
    pts,
    dsize,
    scale=1.5,
    vx_ratio=0,
    vy_ratio=-0.1,
    flag_do_rot=True,
    **kwargs
):
    """Estimate the affine transform between the original image and its crop.

    Args:
        pts: landmarks, 101 or 68 points or another supported layout, Nx2.
        dsize: side length of the cropped image.
        scale: the larger the scale factor, the smaller the face ratio.
        vx_ratio: x shift.
        vy_ratio: y shift; the smaller the y shift, the lower the face region.
        flag_do_rot: if True, the rotation correction is included.
        **kwargs: only ``use_lip`` (default True) is read.

    Returns:
        ``(M_INV, M)``: ``M_INV`` maps the original image to the cropped
        image, ``M`` (first two rows of the inverse) maps the cropped image
        back to the original image.
    """
    center, size, angle = parse_rect_from_landmark(
        pts,
        scale=scale,
        vx_ratio=vx_ratio,
        vy_ratio=vy_ratio,
        use_lip=kwargs.get('use_lip', True),
    )

    s = dsize / size[0]  # scale
    tgt_center = np.array([dsize / 2, dsize / 2], dtype=DTYPE)  # center of dsize

    if not flag_do_rot:
        rows = [
            [s, 0, tgt_center[0] - s * center[0]],
            [0, s, tgt_center[1] - s * center[1]],
        ]
    else:
        costheta, sintheta = cos(angle), sin(angle)
        cx, cy = center[0], center[1]  # original center
        tcx, tcy = tgt_center[0], tgt_center[1]  # target center
        rows = [
            [s * costheta, s * sintheta, tcx - s * (costheta * cx + sintheta * cy)],
            [-s * sintheta, s * costheta, tcy - s * (-sintheta * cx + costheta * cy)],
        ]
    M_INV = np.array(rows, dtype=DTYPE)

    # Append the homogeneous row so the 2x3 matrix can be inverted.
    homogeneous = np.vstack([M_INV, np.array([0, 0, 1])])
    M = np.linalg.inv(homogeneous)

    return M_INV, M[:2, ...]
374
+
375
+
376
def crop_image(img, pts: np.ndarray, **kwargs):
    """Crop an image around the given landmarks.

    Args:
        img: source image, passed to ``_transform_img``.
        pts: landmarks used to estimate the crop transform.
        **kwargs: optional settings, all forwarded to the transform
            estimation: ``dsize`` (default 224), ``scale`` (default 1.5),
            ``vx_ratio`` (default 0), ``vy_ratio`` (default -0.1),
            ``flag_do_rot`` (default True) and ``use_lip`` (default True).

    Returns:
        dict with ``M_o2c`` / ``M_c2o`` (3x3, original <-> crop), ``img_crop``
        (the cropped image) and ``pt_crop`` (landmarks in crop coordinates).
    """
    dsize = kwargs.get('dsize', 224)
    scale = kwargs.get('scale', 1.5)  # 1.5 | 1.6
    vy_ratio = kwargs.get('vy_ratio', -0.1)  # -0.0625 | -0.1

    # vx_ratio and use_lip used to be dropped silently; the defaults below
    # equal the estimator's own defaults, so existing callers are unaffected.
    M_INV, _ = _estimate_similar_transform_from_pts(
        pts,
        dsize=dsize,
        scale=scale,
        vx_ratio=kwargs.get('vx_ratio', 0),
        vy_ratio=vy_ratio,
        flag_do_rot=kwargs.get('flag_do_rot', True),
        use_lip=kwargs.get('use_lip', True),
    )

    img_crop = _transform_img(img, M_INV, dsize)  # origin to crop
    pt_crop = _transform_pts(pts, M_INV)

    M_o2c = np.vstack([M_INV, np.array([0, 0, 1], dtype=DTYPE)])
    M_c2o = np.linalg.inv(M_o2c)

    ret_dct = {
        'M_o2c': M_o2c,  # from the original image to the cropped image 3x3
        'M_c2o': M_c2o,  # from the cropped image to the original image 3x3
        'img_crop': img_crop,  # the cropped image
        'pt_crop': pt_crop,  # the landmarks of the cropped image
    }

    return ret_dct
403
+
404
def average_bbox_lst(bbox_lst):
    """Return the element-wise mean of the given bboxes as a plain list.

    Returns None when ``bbox_lst`` is empty.
    """
    if len(bbox_lst) == 0:
        return None
    stacked = np.array(bbox_lst)
    averaged = np.mean(stacked, axis=0)
    return averaged.tolist()
409
+
410
def prepare_paste_back(mask_crop, crop_M_c2o, dsize):
    """Prepare the blending mask used later when pasting the crop back.

    The crop-space mask is transformed with ``crop_M_c2o`` to size ``dsize``
    and rescaled from the 0..255 range to float32 values in 0..1.
    """
    warped_mask = _transform_img(mask_crop, crop_M_c2o, dsize)
    return warped_mask.astype(np.float32) / 255.
416
+
417
def paste_back(img_crop, M_c2o, img_ori, mask_ori):
    """Paste the cropped image back onto the original image.

    ``img_crop`` is transformed with ``M_c2o`` to the size of ``img_ori`` and
    blended with it using ``mask_ori`` as the per-pixel weight of the crop.
    The result is clipped to 0..255 and returned as uint8.
    """
    height, width = img_ori.shape[0], img_ori.shape[1]
    warped = _transform_img(img_crop, M_c2o, dsize=(width, height))
    blended = mask_ori * warped + (1 - mask_ori) * img_ori
    return np.clip(blended, 0, 255).astype(np.uint8)
python/utils/dependencies/XPose/config_model/UniPose_SwinT.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Configuration values for the 'UniPose' model with the 'swin_T_224_1k'
# backbone. The file is a flat list of module-level assignments; the code
# that reads them is not part of this file.
# NOTE(review): section labels below are inferred from the option names only;
# confirm against the config loader / model builder before relying on them.

# Base config file(s) listed for this config.
_base_ = ['coco_transformer.py']

use_label_enc = True

num_classes=2

# --- presumably optimisation / training-schedule settings ---
lr = 0.0001
param_dict_type = 'default'
lr_backbone = 1e-05
lr_backbone_names = ['backbone.0']
lr_linear_proj_names = ['reference_points', 'sampling_offsets']
lr_linear_proj_mult = 0.1
ddetr_lr_param = False
batch_size = 2
weight_decay = 0.0001
epochs = 12
lr_drop = 11
save_checkpoint_interval = 100
clip_max_norm = 0.1
onecyclelr = False
multi_step_lr = False
# NOTE(review): both values exceed epochs = 12; presumably only relevant when
# multi_step_lr is enabled - confirm.
lr_drop_list = [33, 45]


# --- model identity ---
modelname = 'UniPose'
frozen_weights = None
backbone = 'swin_T_224_1k'


# --- presumably backbone / transformer architecture settings ---
dilation = False
position_embedding = 'sine'
pe_temperatureH = 20
pe_temperatureW = 20
return_interm_indices = [1, 2, 3]
backbone_freeze_keywords = None
enc_layers = 6
dec_layers = 6
unic_layers = 0
pre_norm = False
dim_feedforward = 2048
hidden_dim = 256
dropout = 0.0
nheads = 8
num_queries = 900
query_dim = 4
num_patterns = 0
pdetr3_bbox_embed_diff_each_layer = False
pdetr3_refHW = -1
random_refpoints_xy = False
fix_refpoints_hw = -1
dabdetr_yolo_like_anchor_update = False
dabdetr_deformable_encoder = False
dabdetr_deformable_decoder = False
use_deformable_box_attn = False
box_attn_type = 'roi_align'
dec_layer_number = None
num_feature_levels = 4
enc_n_points = 4
dec_n_points = 4
decoder_layer_noise = False
dln_xy_noise = 0.2
dln_hw_noise = 0.2
add_channel_attention = False
add_pos_value = False
two_stage_type = 'standard'
two_stage_pat_embed = 0
two_stage_add_query_num = 0
two_stage_bbox_embed_share = False
two_stage_class_embed_share = False
two_stage_learn_wh = False
two_stage_default_hw = 0.05
two_stage_keep_all_tokens = False
num_select = 50
transformer_activation = 'relu'
batch_norm_type = 'FrozenBatchNorm2d'
masks = False

decoder_sa_type = 'sa' # ['sa', 'ca_label', 'ca_content']
matcher_type = 'HungarianMatcher' # or SimpleMinsumMatcher
decoder_module_seq = ['sa', 'ca', 'ffn']
nms_iou_threshold = -1

dec_pred_bbox_embed_share = True
dec_pred_class_embed_share = True


# --- "dn_*" options (presumably denoising-training settings - confirm) ---
use_dn = True
dn_number = 100
dn_box_noise_scale = 1.0
dn_label_noise_ratio = 0.5
dn_label_coef=1.0
dn_bbox_coef=1.0
embed_init_tgt = True
dn_labelbook_size = 2000

match_unstable_error = True

# for ema
use_ema = True
ema_decay = 0.9997
ema_epoch = 0

use_detached_boxes_dec_out = False

# --- text-branch options ---
max_text_len = 256
shuffle_type = None

use_text_enhancer = True
use_fusion_layer = True

use_checkpoint = False # True
use_transformer_ckpt = True
text_encoder_type = 'bert-base-uncased'

use_text_cross_attention = True
text_dropout = 0.0
fusion_dropout = 0.0
fusion_droppath = 0.1

# --- keypoint options ---
num_body_points=68
binary_query_selection = False
use_cdn = True
ffn_extra_layernorm = False

fix_size=False
python/utils/dependencies/XPose/config_model/coco_transformer.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Data-augmentation size settings, referenced as a base config by
# UniPose_SwinT.py (its `_base_` list names this file).
# NOTE(review): how each value is consumed is not visible here; the names
# suggest resize scales, a maximum size and crop bounds - confirm in the loader.
data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
data_aug_max_size = 1333
data_aug_scales2_resize = [400, 500, 600]
data_aug_scales2_crop = [384, 600]


data_aug_scale_overlap = None
8
+
python/utils/dependencies/XPose/models/UniPose/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # Conditional DETR
3
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Copied from DETR (https://github.com/facebookresearch/detr)
7
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+
10
+ from .unipose import build_unipose
python/utils/dependencies/XPose/models/UniPose/attention.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # ED-Pose
8
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # Conditional DETR
12
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
13
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
14
+ # ------------------------------------------------------------------------
15
+ # Modified from codes in torch.nn
16
+ # ------------------------------------------------------------------------
17
+
18
+ """
19
+ MultiheadAttention that support query, key, and value to have different dimensions.
20
+ Query, key, and value projections are removed.
21
+
22
+ Mostly copy-paste from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/activation.py#L873
23
+ and https://github.com/pytorch/pytorch/blob/master/torch/nn/functional.py#L4837
24
+ """
25
+
26
+ import warnings
27
+ import torch
28
+ from torch.nn.modules.linear import Linear
29
+ from torch.nn.init import constant_
30
+ from torch.nn.modules.module import Module
31
+ from torch._jit_internal import Optional, Tuple
32
+ try:
33
+ from torch.overrides import has_torch_function, handle_torch_function
34
+ except:
35
+ from torch._overrides import has_torch_function, handle_torch_function
36
+ from torch.nn.functional import linear, pad, softmax, dropout
37
+ Tensor = torch.Tensor
38
+
39
class MultiheadAttention(Module):
    r"""Multi-head attention without input projections.

    Unlike ``torch.nn.MultiheadAttention`` this module owns no query/key/value
    projection weights (``in_proj_weight``, ``q_proj_weight`` etc. are all
    ``None``); the only learnable layer is the output projection ``out_proj``
    of shape ``(vdim, vdim)``. Query and key must have ``embed_dim`` features,
    the value may have ``vdim`` features.

    Args:
        embed_dim: total dimension of query and key.
        num_heads: number of parallel attention heads; must divide embed_dim.
        dropout: dropout probability on the attention weights. Default: 0.0.
        bias: accepted for interface compatibility; not used by this class.
        add_bias_kv: accepted for interface compatibility; not used by this
            class (``bias_k`` / ``bias_v`` are always ``None``).
        add_zero_attn: append a zero entry to the key and value sequences.
        kdim: number of key features. Default: None (uses embed_dim).
        vdim: number of value features. Default: None (uses embed_dim).

    Examples::
        >>> multihead_attn = MultiheadAttention(embed_dim, num_heads)
        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
    """
    bias_k: Optional[torch.Tensor]
    bias_v: Optional[torch.Tensor]

    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = embed_dim if kdim is None else kdim
        self.vdim = embed_dim if vdim is None else vdim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.add_zero_attn = add_zero_attn
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        # No input projections are kept by this variant.
        self.in_proj_weight = None
        self.in_proj_bias = None
        self.q_proj_weight = None
        self.k_proj_weight = None
        self.v_proj_weight = None
        self.bias_k = self.bias_v = None

        # The output projection works on the value feature dimension.
        self.out_proj = Linear(self.vdim, self.vdim)

        self._reset_parameters()

    def _reset_parameters(self):
        """Zero the bias of the output projection."""
        constant_(self.out_proj.bias, 0.)

    def __setstate__(self, state):
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        if '_qkv_same_embed_dim' not in state:
            state['_qkv_same_embed_dim'] = True

        super().__setstate__(state)

    def forward(self, query, key, value, key_padding_mask=None,
                need_weights=True, attn_mask=None):
        # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]]
        r"""Run attention through ``multi_head_attention_forward``.

        Args:
            query, key, value: map a query and a set of key-value pairs to an
                output; passed through unchanged.
            key_padding_mask: optional mask of key positions to ignore;
                True / non-zero entries are ignored.
            need_weights: whether to also return the attention weights.
            attn_mask: optional 2D or 3D mask preventing attention to certain
                positions; a float mask is added to the attention scores.

        Returns:
            The ``(attn_output, attn_output_weights)`` pair returned by
            ``multi_head_attention_forward``; the weights are ``None`` when
            ``need_weights`` is False.
        """
        separate_proj = {}
        if not self._qkv_same_embed_dim:
            # Key/value feature sizes differ from embed_dim: hand over the
            # (unset) separate projection weights exactly as torch.nn does.
            separate_proj = dict(
                use_separate_proj_weight=True,
                q_proj_weight=self.q_proj_weight,
                k_proj_weight=self.k_proj_weight,
                v_proj_weight=self.v_proj_weight,
            )

        return multi_head_attention_forward(
            query, key, value, self.embed_dim, self.num_heads,
            self.in_proj_weight, self.in_proj_bias,
            self.bias_k, self.bias_v, self.add_zero_attn,
            self.dropout, self.out_proj.weight, self.out_proj.bias,
            training=self.training,
            key_padding_mask=key_padding_mask, need_weights=need_weights,
            attn_mask=attn_mask, out_dim=self.vdim,
            **separate_proj)
161
+
162
+
163
def multi_head_attention_forward(query: Tensor,
                                 key: Tensor,
                                 value: Tensor,
                                 embed_dim_to_check: int,
                                 num_heads: int,
                                 in_proj_weight: Tensor,
                                 in_proj_bias: Tensor,
                                 bias_k: Optional[Tensor],
                                 bias_v: Optional[Tensor],
                                 add_zero_attn: bool,
                                 dropout_p: float,
                                 out_proj_weight: Tensor,
                                 out_proj_bias: Tensor,
                                 training: bool = True,
                                 key_padding_mask: Optional[Tensor] = None,
                                 need_weights: bool = True,
                                 attn_mask: Optional[Tensor] = None,
                                 use_separate_proj_weight: bool = False,
                                 q_proj_weight: Optional[Tensor] = None,
                                 k_proj_weight: Optional[Tensor] = None,
                                 v_proj_weight: Optional[Tensor] = None,
                                 static_k: Optional[Tensor] = None,
                                 static_v: Optional[Tensor] = None,
                                 out_dim: Optional[int] = None
                                 ) -> Tuple[Tensor, Optional[Tensor]]:
    r"""Multi-head attention on already-projected query, key and value.

    No input projection is applied: ``in_proj_weight``, ``in_proj_bias``,
    ``use_separate_proj_weight`` and ``q/k/v_proj_weight`` are accepted for
    signature compatibility with ``torch.nn.functional`` but are not used in
    the computation. Only the output projection is applied.

    Args:
        query: ``(L, N, E)``, L target length, N batch size, E embed dim.
        key: ``(S, N, E)``, S source length.
        value: ``(S, N, out_dim)``.
        embed_dim_to_check: expected value of E.
        num_heads: number of parallel attention heads.
        bias_k, bias_v: optional entries appended to the key and value
            sequences at dim=0.
        add_zero_attn: append a zero entry to key and value at dim=1.
        dropout_p: dropout probability on the attention weights.
        out_proj_weight, out_proj_bias: the output projection.
        training: apply dropout if True.
        key_padding_mask: ``(N, S)``; True / non-zero key positions are
            filled with -inf before the softmax.
        need_weights: also return the head-averaged attention weights.
        attn_mask: ``(L, S)`` or ``(N*num_heads, L, S)``; a bool/byte mask
            blocks positions, a float mask is added to the scores.
        static_k, static_v: ``(N*num_heads, S, head_dim)`` tensors that
            replace the reshaped key / value.
        out_dim: feature dimension of ``value`` (and of the output before
            the output projection). Defaults to ``value.size(-1)``.

    Returns:
        ``(attn_output, attn_output_weights)`` where attn_output is
        ``(L, N, out_features of out_proj)`` and attn_output_weights is
        ``(N, L, S)`` or ``None`` when ``need_weights`` is False.
    """
    if not torch.jit.is_scripting():
        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
                    out_proj_weight, out_proj_bias)
        if any(type(t) is not Tensor for t in tens_ops) and has_torch_function(tens_ops):
            return handle_torch_function(
                multi_head_attention_forward, tens_ops, query, key, value,
                embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
                bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
                out_proj_bias, training=training, key_padding_mask=key_padding_mask,
                need_weights=need_weights, attn_mask=attn_mask,
                use_separate_proj_weight=use_separate_proj_weight,
                q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
                v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v,
                out_dim=out_dim)
    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == embed_dim_to_check
    # allow MHA to have different sizes for the feature dimension
    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)

    # Without an explicit out_dim, split whatever feature size value carries
    # (previously the None default crashed on `out_dim // num_heads`).
    if out_dim is None:
        out_dim = value.size(-1)

    head_dim = embed_dim // num_heads
    v_head_dim = out_dim // num_heads
    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
    scaling = float(head_dim) ** -0.5

    q = query * scaling
    k = key
    v = value

    if attn_mask is not None:
        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
        if attn_mask.dtype == torch.uint8:
            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
            attn_mask = attn_mask.to(torch.bool)

        if attn_mask.dim() == 2:
            attn_mask = attn_mask.unsqueeze(0)
            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 2D attn_mask is not correct.')
        elif attn_mask.dim() == 3:
            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 3D attn_mask is not correct.')
        else:
            raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
        # attn_mask's dim is 3 now.

    # convert ByteTensor key_padding_mask to bool
    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
        key_padding_mask = key_padding_mask.to(torch.bool)

    if bias_k is not None and bias_v is not None:
        if static_k is None and static_v is None:
            # Append the bias entries and grow the masks by one key position.
            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = pad(attn_mask, (0, 1))
            if key_padding_mask is not None:
                key_padding_mask = pad(key_padding_mask, (0, 1))
        else:
            assert static_k is None, "bias cannot be added to static key."
            assert static_v is None, "bias cannot be added to static value."
    else:
        assert bias_k is None
        assert bias_v is None

    # Fold the heads into the batch dimension: (N*num_heads, len, head_dim).
    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
    v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1)

    if static_k is not None:
        assert static_k.size(0) == bsz * num_heads
        assert static_k.size(2) == head_dim
        k = static_k

    if static_v is not None:
        assert static_v.size(0) == bsz * num_heads
        assert static_v.size(2) == v_head_dim
        v = static_v

    src_len = k.size(1)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if add_zero_attn:
        src_len += 1
        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            attn_output_weights.masked_fill_(attn_mask, float('-inf'))
        else:
            attn_output_weights += attn_mask

    if key_padding_mask is not None:
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        attn_output_weights = attn_output_weights.masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),
            float('-inf'),
        )
        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)

    # Subtract the row maximum before the softmax for numerical stability.
    attn_output_weights = softmax(
        attn_output_weights - attn_output_weights.max(dim=-1, keepdim=True)[0], dim=-1)
    attn_output_weights = dropout(attn_output_weights, p=dropout_p, training=training)

    attn_output = torch.bmm(attn_output_weights, v)
    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim]
    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
373
+
python/utils/dependencies/XPose/models/UniPose/backbone.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # Conditional DETR
8
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # Copied from DETR (https://github.com/facebookresearch/detr)
12
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
13
+ # ------------------------------------------------------------------------
14
+
15
+ """
16
+ Backbone modules.
17
+ """
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ import torchvision
22
+ from torch import nn
23
+ from torchvision.models._utils import IntermediateLayerGetter
24
+ from typing import Dict, List
25
+
26
+ from util.misc import NestedTensor, is_main_process
27
+
28
+ from .position_encoding import build_position_encoding
29
+ from .swin_transformer import build_swin_transformer
30
+
31
class FrozenBatchNorm2d(torch.nn.Module):
    """
    2-D batch normalization whose statistics and affine terms never change.

    ``weight``, ``bias``, ``running_mean`` and ``running_var`` are registered
    as buffers rather than parameters, so ``forward`` only applies a fixed
    per-channel scale and shift.

    Adapted from torchvision.misc.ops, with eps added before the rsqrt;
    without it, models other than torchvision.models.resnet[18,34,50,101]
    produce NaNs.
    """

    def __init__(self, n):
        super().__init__()
        # Start as the identity transform: unit scale/variance, zero shift/mean.
        for buffer_name, make_initial in (
            ("weight", torch.ones),
            ("bias", torch.zeros),
            ("running_mean", torch.zeros),
            ("running_var", torch.ones),
        ):
            self.register_buffer(buffer_name, make_initial(n))

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        # This module registers no ``num_batches_tracked`` buffer, so discard
        # that entry before delegating; otherwise it would be reported as an
        # unexpected key.
        state_dict.pop(prefix + "num_batches_tracked", None)

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def forward(self, x):
        # Reshape every buffer to (1, C, 1, 1) up front (fuser-friendly) so it
        # broadcasts over the batch and spatial dimensions of ``x``.
        broadcast_shape = (1, -1, 1, 1)
        gamma = self.weight.reshape(broadcast_shape)
        beta = self.bias.reshape(broadcast_shape)
        variance = self.running_var.reshape(broadcast_shape)
        mean = self.running_mean.reshape(broadcast_shape)
        eps = 1e-5
        # Fold normalization and affine transform into one multiply-add.
        scale = gamma * (variance + eps).rsqrt()
        shift = beta - mean * scale
        return x * scale + shift
69
+
70
+
71
class BackboneBase(nn.Module):
    """
    Wrap a backbone so that it returns selected intermediate feature maps.

    Each returned feature map is paired with the input padding mask resized
    to that feature map's spatial size.
    """

    def __init__(
        self,
        backbone: nn.Module,
        train_backbone: bool,
        num_channels: int,
        return_interm_indices: list,
    ):
        super().__init__()
        # Only parameters whose name mentions layer2/3/4 stay trainable, and
        # only when ``train_backbone`` is set; everything else is frozen.
        trainable_stages = ("layer2", "layer3", "layer4")
        for param_name, param in backbone.named_parameters():
            in_trainable_stage = any(stage in param_name for stage in trainable_stages)
            if not (train_backbone and in_trainable_stage):
                param.requires_grad_(False)

        # Map the last ``len(return_interm_indices)`` stages ("layerK" ...
        # "layer4") onto output names taken from ``return_interm_indices``.
        first_stage = 5 - len(return_interm_indices)
        return_layers = {
            f"layer{first_stage + offset}": f"{layer_index}"
            for offset, layer_index in enumerate(return_interm_indices)
        }

        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        """Return a dict mapping output name to a NestedTensor(feature, mask)."""
        features = self.body(tensor_list.tensors)
        out = {}
        for level_name, feature in features.items():
            padding_mask = tensor_list.mask
            assert padding_mask is not None
            # Resize the mask to the feature map's H x W (default interpolation
            # mode), then turn it back into a boolean mask.
            resized = F.interpolate(padding_mask[None].float(), size=feature.shape[-2:])
            out[level_name] = NestedTensor(feature, resized.to(torch.bool)[0])
        return out
108
+
109
+
110
class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""

    def __init__(
        self,
        name: str,
        train_backbone: bool,
        dilation: bool,
        return_interm_indices: list,
        batch_norm=FrozenBatchNorm2d,
    ):
        """
        Build a torchvision ResNet and expose its last stages.

        Args:
            name: torchvision model name; only "resnet50" and "resnet101"
                pass the checks below.
            train_backbone: forwarded to BackboneBase.
            dilation: replace the stride of the last stage with dilation.
            return_interm_indices: one of [0, 1, 2, 3], [1, 2, 3] or [3].
            batch_norm: normalization layer class handed to torchvision.
        """
        known_resnets = ("resnet18", "resnet34", "resnet50", "resnet101")
        if name not in known_resnets:
            raise NotImplementedError("Why you can get here with name {}".format(name))

        resnet_factory = getattr(torchvision.models, name)
        backbone = resnet_factory(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(),
            norm_layer=batch_norm,
        )

        assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available."
        assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]

        # Channel widths of the four stages; keep the trailing entries that
        # correspond to the requested number of feature levels.
        stage_channels = [256, 512, 1024, 2048]
        num_channels = stage_channels[len(stage_channels) - len(return_interm_indices):]
        super().__init__(backbone, train_backbone, num_channels, return_interm_indices)
135
+
136
+
137
class Joiner(nn.Sequential):
    """Pair a backbone (index 0) with a position-embedding module (index 1)."""

    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        """
        Run the backbone, then compute a position encoding per feature level.

        Returns:
            A pair ``(features, pos)``: the backbone outputs in dict order and,
            for each one, its position encoding cast to the feature's dtype.
        """
        backbone, position_embedding = self[0], self[1]
        features = list(backbone(tensor_list).values())
        pos = [position_embedding(feature).to(feature.tensors.dtype) for feature in features]
        return features, pos
151
+
152
+
153
def build_backbone(args):
    """
    Build the backbone plus position encoding and wrap them in a Joiner.

    Attributes read from ``args`` by this function:
        - backbone: backbone name ("resnet50", "resnet101" or one of the
          "swin_*" names listed below)
        - dilation: forwarded to the ResNet backbone only
        - return_interm_indices: one of [0, 1, 2, 3], [1, 2, 3], [3]
        - use_checkpoint: optional, defaults to False; swin only for now

    ``args`` is also passed to ``build_position_encoding``, which may read
    further attributes. ``lr_backbone`` and ``backbone_freeze_keywords`` are
    not consulted here: the backbone is always built with
    ``train_backbone=True``.

    Returns:
        A Joiner whose ``num_channels`` attribute lists the channel width of
        every returned feature level.

    Raises:
        NotImplementedError: if ``args.backbone`` is not a supported name.
    """
    position_embedding = build_position_encoding(args)
    # Training of the backbone is always enabled in this build.
    train_backbone = True
    return_interm_indices = args.return_interm_indices
    assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
    use_checkpoint = getattr(args, "use_checkpoint", False)

    if args.backbone in ["resnet50", "resnet101"]:
        backbone = Backbone(
            args.backbone,
            train_backbone,
            args.dilation,
            return_interm_indices,
            batch_norm=FrozenBatchNorm2d,
        )
        bb_num_channels = backbone.num_channels
    elif args.backbone in [
        "swin_T_224_1k",
        "swin_B_224_22k",
        "swin_B_384_22k",
        "swin_L_224_22k",
        "swin_L_384_22k",
    ]:
        # The pre-training image size is the second-to-last "_" field of the
        # name, e.g. "swin_T_224_1k" -> 224.
        pretrain_img_size = int(args.backbone.split("_")[-2])
        backbone = build_swin_transformer(
            args.backbone,
            pretrain_img_size=pretrain_img_size,
            out_indices=tuple(return_interm_indices),
            dilation=False,
            use_checkpoint=use_checkpoint,
        )

        # Keep the widths of the trailing stages that are actually returned.
        bb_num_channels = backbone.num_features[4 - len(return_interm_indices):]
    else:
        raise NotImplementedError("Unknown backbone {}".format(args.backbone))

    assert len(bb_num_channels) == len(
        return_interm_indices
    ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"

    model = Joiner(backbone, position_embedding)
    model.num_channels = bb_num_channels
    assert isinstance(
        bb_num_channels, list
    ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))
    return model
python/utils/dependencies/XPose/models/UniPose/deformable_transformer.py ADDED
@@ -0,0 +1,1230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # UniPose
3
+ # url: https://github.com/IDEA-Research/UniPose
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # ED-Pose
8
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
9
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
+ # ------------------------------------------------------------------------
11
+ # DINO
12
+ # Copyright (c) 2022 IDEA. All Rights Reserved.
13
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
14
+ # ------------------------------------------------------------------------
15
+ # Modified from DETR (https://github.com/facebookresearch/detr)
16
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
17
+ # ------------------------------------------------------------------------
18
+
19
+ import math
20
+ import copy
21
+ import torch
22
+ import torch.utils.checkpoint as checkpoint
23
+ from torch import nn, Tensor
24
+ from typing import Optional
25
+ from util.misc import inverse_sigmoid
26
+
27
+ from .transformer_vanilla import TransformerEncoderLayer
28
+ from .fuse_modules import BiAttentionBlock
29
+ from .utils import gen_encoder_output_proposals, MLP, _get_activation_fn, gen_sineembed_for_position, get_sine_pos_embed
30
+ from .ops.modules import MSDeformAttn
31
+
32
+
33
+ class DeformableTransformer(nn.Module):
34
+
35
+ def __init__(self, d_model=256, nhead=8,
36
+ num_queries=300,
37
+ num_encoder_layers=6,
38
+ num_unicoder_layers=0,
39
+ num_decoder_layers=6,
40
+ dim_feedforward=2048, dropout=0.0,
41
+ activation="relu", normalize_before=False,
42
+ return_intermediate_dec=False, query_dim=4,
43
+ num_patterns=0,
44
+ modulate_hw_attn=False,
45
+ # for deformable encoder
46
+ deformable_encoder=False,
47
+ deformable_decoder=False,
48
+ num_feature_levels=1,
49
+ enc_n_points=4,
50
+ dec_n_points=4,
51
+ use_deformable_box_attn=False,
52
+ box_attn_type='roi_align',
53
+ # init query
54
+ learnable_tgt_init=False,
55
+ decoder_query_perturber=None,
56
+ add_channel_attention=False,
57
+ add_pos_value=False,
58
+ random_refpoints_xy=False,
59
+ # two stage
60
+ two_stage_type='no',
61
+ two_stage_pat_embed=0,
62
+ two_stage_add_query_num=0,
63
+ two_stage_learn_wh=False,
64
+ two_stage_keep_all_tokens=False,
65
+ # evo of #anchors
66
+ dec_layer_number=None,
67
+ rm_enc_query_scale=True,
68
+ rm_dec_query_scale=True,
69
+ rm_self_attn_layers=None,
70
+ key_aware_type=None,
71
+ # layer share
72
+ layer_share_type=None,
73
+ # for detach
74
+ rm_detach=None,
75
+ decoder_sa_type='ca',
76
+ module_seq=['sa', 'ca', 'ffn'],
77
+ # for dn
78
+ embed_init_tgt=False,
79
+
80
+ use_detached_boxes_dec_out=False,
81
+ use_text_enhancer=False,
82
+ use_fusion_layer=False,
83
+ use_checkpoint=False,
84
+ use_transformer_ckpt=False,
85
+ use_text_cross_attention=False,
86
+ text_dropout=0.1,
87
+ fusion_dropout=0.1,
88
+ fusion_droppath=0.0,
89
+
90
+ binary_query_selection=False,
91
+ ffn_extra_layernorm=False,
92
+ ):
93
+ super().__init__()
94
+ self.num_feature_levels = num_feature_levels
95
+ self.num_encoder_layers = num_encoder_layers
96
+ self.num_unicoder_layers = num_unicoder_layers
97
+ self.num_decoder_layers = num_decoder_layers
98
+ self.deformable_encoder = deformable_encoder
99
+ self.deformable_decoder = deformable_decoder
100
+ self.two_stage_keep_all_tokens = two_stage_keep_all_tokens
101
+ self.num_queries = num_queries
102
+ self.random_refpoints_xy = random_refpoints_xy
103
+ self.use_detached_boxes_dec_out = use_detached_boxes_dec_out
104
+ self.ffn_extra_layernorm = ffn_extra_layernorm
105
+ assert query_dim == 4
106
+
107
+ self.binary_query_selection = binary_query_selection
108
+ if self.binary_query_selection:
109
+ self.binary_query_selection_layer = nn.Linear(d_model, 1)
110
+ # assert not binary_query_selection, 'binary_query_selection not implemented yet'
111
+
112
+ if num_feature_levels > 1:
113
+ assert deformable_encoder, "only support deformable_encoder for num_feature_levels > 1"
114
+ if use_deformable_box_attn:
115
+ assert deformable_encoder or deformable_encoder
116
+
117
+ assert layer_share_type in [None, 'encoder', 'decoder', 'both']
118
+ if layer_share_type in ['encoder', 'both']:
119
+ enc_layer_share = True
120
+ else:
121
+ enc_layer_share = False
122
+ if layer_share_type in ['decoder', 'both']:
123
+ dec_layer_share = True
124
+ else:
125
+ dec_layer_share = False
126
+ assert layer_share_type is None
127
+
128
+ self.decoder_sa_type = decoder_sa_type
129
+ assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
130
+
131
+ # choose encoder layer type
132
+ if deformable_encoder:
133
+ encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
134
+ dropout, activation,
135
+ num_feature_levels, nhead, enc_n_points,
136
+ add_channel_attention=add_channel_attention,
137
+ use_deformable_box_attn=use_deformable_box_attn,
138
+ box_attn_type=box_attn_type)
139
+ else:
140
+ raise NotImplementedError
141
+
142
+ if use_text_enhancer:
143
+ text_enhance_layer = TransformerEncoderLayer(
144
+ d_model=d_model,
145
+ nhead=nhead // 2,
146
+ dim_feedforward=dim_feedforward // 2,
147
+ dropout=text_dropout
148
+ )
149
+ else:
150
+ text_enhance_layer = None
151
+
152
+ if use_fusion_layer:
153
+ feature_fusion_layer = BiAttentionBlock(
154
+ v_dim=d_model,
155
+ l_dim=d_model,
156
+ embed_dim=dim_feedforward // 2,
157
+ num_heads=nhead // 2,
158
+ dropout=fusion_dropout,
159
+ drop_path=fusion_droppath
160
+ )
161
+ else:
162
+ feature_fusion_layer = None
163
+
164
+ encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
165
+ assert encoder_norm is None
166
+ self.encoder = TransformerEncoder(
167
+ encoder_layer, num_encoder_layers, d_model=d_model,
168
+ num_queries=num_queries,
169
+ enc_layer_share=enc_layer_share,
170
+ text_enhance_layer=text_enhance_layer,
171
+ feature_fusion_layer=feature_fusion_layer,
172
+ use_checkpoint=use_checkpoint,
173
+ use_transformer_ckpt=use_transformer_ckpt,
174
+ )
175
+
176
+ # choose decoder layer type
177
+ if deformable_decoder:
178
+ decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
179
+ dropout, activation,
180
+ num_feature_levels, nhead, dec_n_points,
181
+ use_text_cross_attention=use_text_cross_attention,
182
+ ffn_extra_layernorm=ffn_extra_layernorm, )
183
+
184
+ else:
185
+ raise NotImplementedError
186
+
187
+ decoder_norm = nn.LayerNorm(d_model)
188
+ self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
189
+ return_intermediate=return_intermediate_dec,
190
+ d_model=d_model, query_dim=query_dim,
191
+ modulate_hw_attn=modulate_hw_attn,
192
+ num_feature_levels=num_feature_levels,
193
+ deformable_decoder=deformable_decoder,
194
+ decoder_query_perturber=decoder_query_perturber,
195
+ dec_layer_number=dec_layer_number, rm_dec_query_scale=rm_dec_query_scale,
196
+ dec_layer_share=dec_layer_share,
197
+ use_detached_boxes_dec_out=use_detached_boxes_dec_out
198
+ )
199
+
200
+ self.d_model = d_model
201
+ self.nhead = nhead
202
+ self.dec_layers = num_decoder_layers
203
+ self.num_queries = num_queries # useful for single stage model only
204
+ self.num_patterns = num_patterns
205
+ if not isinstance(num_patterns, int):
206
+ Warning("num_patterns should be int but {}".format(type(num_patterns)))
207
+ self.num_patterns = 0
208
+
209
+ if num_feature_levels > 1:
210
+ if self.num_encoder_layers > 0:
211
+ self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
212
+ else:
213
+ self.level_embed = None
214
+
215
+ self.learnable_tgt_init = learnable_tgt_init
216
+ assert learnable_tgt_init, "why not learnable_tgt_init"
217
+ self.embed_init_tgt = embed_init_tgt
218
+ if (two_stage_type != 'no' and embed_init_tgt) or (two_stage_type == 'no'):
219
+ self.tgt_embed = nn.Embedding(self.num_queries, d_model)
220
+ nn.init.normal_(self.tgt_embed.weight.data)
221
+ else:
222
+ self.tgt_embed = None
223
+
224
+ # for two stage
225
+ self.two_stage_type = two_stage_type
226
+ self.two_stage_pat_embed = two_stage_pat_embed
227
+ self.two_stage_add_query_num = two_stage_add_query_num
228
+ self.two_stage_learn_wh = two_stage_learn_wh
229
+ assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
230
+ if two_stage_type == 'standard':
231
+ # anchor selection at the output of encoder
232
+ self.enc_output = nn.Linear(d_model, d_model)
233
+ self.enc_output_norm = nn.LayerNorm(d_model)
234
+
235
+ if two_stage_pat_embed > 0:
236
+ self.pat_embed_for_2stage = nn.Parameter(torch.Tensor(two_stage_pat_embed, d_model))
237
+ nn.init.normal_(self.pat_embed_for_2stage)
238
+
239
+ if two_stage_add_query_num > 0:
240
+ self.tgt_embed = nn.Embedding(self.two_stage_add_query_num, d_model)
241
+
242
+ if two_stage_learn_wh:
243
+ # import ipdb; ipdb.set_trace()
244
+ self.two_stage_wh_embedding = nn.Embedding(1, 2)
245
+ else:
246
+ self.two_stage_wh_embedding = None
247
+
248
+ if two_stage_type == 'no':
249
+ self.init_ref_points(num_queries) # init self.refpoint_embed
250
+
251
+ self.enc_out_class_embed = None
252
+ self.enc_out_bbox_embed = None
253
+
254
+ # evolution of anchors
255
+ self.dec_layer_number = dec_layer_number
256
+ if dec_layer_number is not None:
257
+ if self.two_stage_type != 'no' or num_patterns == 0:
258
+ assert dec_layer_number[
259
+ 0] == num_queries, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries})"
260
+ else:
261
+ assert dec_layer_number[
262
+ 0] == num_queries * num_patterns, f"dec_layer_number[0]({dec_layer_number[0]}) != num_queries({num_queries}) * num_patterns({num_patterns})"
263
+
264
+ self._reset_parameters()
265
+
266
+ self.rm_self_attn_layers = rm_self_attn_layers
267
+ if rm_self_attn_layers is not None:
268
+ # assert len(rm_self_attn_layers) == num_decoder_layers
269
+ print("Removing the self-attn in {} decoder layers".format(rm_self_attn_layers))
270
+ for lid, dec_layer in enumerate(self.decoder.layers):
271
+ if lid in rm_self_attn_layers:
272
+ dec_layer.rm_self_attn_modules()
273
+
274
+ self.rm_detach = rm_detach
275
+ if self.rm_detach:
276
+ assert isinstance(rm_detach, list)
277
+ assert any([i in ['enc_ref', 'enc_tgt', 'dec'] for i in rm_detach])
278
+ self.decoder.rm_detach = rm_detach
279
+
280
+ def _reset_parameters(self):
281
+ for p in self.parameters():
282
+ if p.dim() > 1:
283
+ nn.init.xavier_uniform_(p)
284
+ for m in self.modules():
285
+ if isinstance(m, MSDeformAttn):
286
+ m._reset_parameters()
287
+ if self.num_feature_levels > 1 and self.level_embed is not None:
288
+ nn.init.normal_(self.level_embed)
289
+
290
+ if self.two_stage_learn_wh:
291
+ nn.init.constant_(self.two_stage_wh_embedding.weight, math.log(0.05 / (1 - 0.05)))
292
+
293
+ def get_valid_ratio(self, mask):
294
+ _, H, W = mask.shape
295
+ valid_H = torch.sum(~mask[:, :, 0], 1)
296
+ valid_W = torch.sum(~mask[:, 0, :], 1)
297
+ valid_ratio_h = valid_H.float() / H
298
+ valid_ratio_w = valid_W.float() / W
299
+ valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
300
+ return valid_ratio
301
+
302
+ def init_ref_points(self, use_num_queries):
303
+ self.refpoint_embed = nn.Embedding(use_num_queries, 4)
304
+
305
+ if self.random_refpoints_xy:
306
+ # import ipdb; ipdb.set_trace()
307
+ self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
308
+ self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
309
+ self.refpoint_embed.weight.data[:, :2].requires_grad = False
310
+
311
+ def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, attn_mask2=None, text_dict=None,
312
+ dn_meta=None,targets=None,kpt_embed=None):
313
+ """
314
+ Input:
315
+ - srcs: List of multi features [bs, ci, hi, wi]
316
+ - masks: List of multi masks [bs, hi, wi]
317
+ - refpoint_embed: [bs, num_dn, 4]. None in infer
318
+ - pos_embeds: List of multi pos embeds [bs, ci, hi, wi]
319
+ - tgt: [bs, num_dn, d_model]. None in infer
320
+
321
+ """
322
+ # if self.two_stage_type != 'no' and self.two_stage_add_query_num == 0:
323
+ # assert refpoint_embed is None
324
+
325
+ # prepare input for encoder
326
+ src_flatten = []
327
+ mask_flatten = []
328
+ lvl_pos_embed_flatten = []
329
+ spatial_shapes = []
330
+ for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
331
+ bs, c, h, w = src.shape
332
+ spatial_shape = (h, w)
333
+ spatial_shapes.append(spatial_shape)
334
+
335
+ src = src.flatten(2).transpose(1, 2) # bs, hw, c
336
+ mask = mask.flatten(1) # bs, hw
337
+ pos_embed = pos_embed.flatten(2).transpose(1, 2) # bs, hw, c
338
+ if self.num_feature_levels > 1 and self.level_embed is not None:
339
+ lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
340
+ else:
341
+ lvl_pos_embed = pos_embed
342
+ lvl_pos_embed_flatten.append(lvl_pos_embed)
343
+ src_flatten.append(src)
344
+ mask_flatten.append(mask)
345
+ src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c
346
+ mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw}
347
+ lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c
348
+ spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
349
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
350
+ valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
351
+
352
+ # two stage
353
+ enc_topk_proposals = enc_refpoint_embed = None
354
+
355
+ #########################################################
356
+ # Begin Encoder
357
+ #########################################################
358
+ memory, memory_text = self.encoder(
359
+ src_flatten,
360
+ pos=lvl_pos_embed_flatten,
361
+ level_start_index=level_start_index,
362
+ spatial_shapes=spatial_shapes,
363
+ valid_ratios=valid_ratios,
364
+ key_padding_mask=mask_flatten,
365
+ memory_text=text_dict['encoded_text'],
366
+ text_attention_mask=~text_dict['text_token_mask'],
367
+ # we ~ the mask . False means use the token; True means pad the token
368
+ position_ids=text_dict['position_ids'],
369
+ text_self_attention_masks=text_dict['text_self_attention_masks'],
370
+ )
371
+ #########################################################
372
+ # End Encoder
373
+ # - memory: bs, \sum{hw}, c
374
+ # - mask_flatten: bs, \sum{hw}
375
+ # - lvl_pos_embed_flatten: bs, \sum{hw}, c
376
+ # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
377
+ # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
378
+ #########################################################
379
+ text_dict['encoded_text'] = memory_text
380
+
381
+ if self.two_stage_type == 'standard':
382
+ if self.two_stage_learn_wh:
383
+ input_hw = self.two_stage_wh_embedding.weight[0]
384
+ else:
385
+ input_hw = None
386
+ output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes,
387
+ input_hw)
388
+ output_memory = self.enc_output_norm(self.enc_output(output_memory))
389
+
390
+ if self.two_stage_pat_embed > 0:
391
+ bs, nhw, _ = output_memory.shape
392
+ # output_memory: bs, n, 256; self.pat_embed_for_2stage: k, 256
393
+ output_memory = output_memory.repeat(1, self.two_stage_pat_embed, 1)
394
+ _pats = self.pat_embed_for_2stage.repeat_interleave(nhw, 0)
395
+ output_memory = output_memory + _pats
396
+ output_proposals = output_proposals.repeat(1, self.two_stage_pat_embed, 1)
397
+
398
+ if self.two_stage_add_query_num > 0:
399
+ assert refpoint_embed is not None
400
+ output_memory = torch.cat((output_memory, tgt), dim=1)
401
+ output_proposals = torch.cat((output_proposals, refpoint_embed), dim=1)
402
+
403
+ if self.binary_query_selection:
404
+ topk_logits = self.binary_query_selection_layer(output_memory).squeeze(-1)
405
+ else:
406
+ if text_dict is not None:
407
+ enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
408
+ else:
409
+ enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)
410
+
411
+ topk_logits = enc_outputs_class_unselected.max(-1)[0]
412
+ enc_outputs_coord_unselected = self.enc_out_bbox_embed(
413
+ output_memory) + output_proposals # (bs, \sum{hw}, 4) unsigmoid
414
+ topk = self.num_queries
415
+
416
+ topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] # bs, nq
417
+
418
+ # gather boxes
419
+ refpoint_embed_undetach = torch.gather(enc_outputs_coord_unselected, 1,
420
+ topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) # unsigmoid
421
+ refpoint_embed_ = refpoint_embed_undetach.detach()
422
+ init_box_proposal = torch.gather(output_proposals, 1,
423
+ topk_proposals.unsqueeze(-1).repeat(1, 1, 4)).sigmoid() # sigmoid
424
+
425
+ # gather tgt
426
+ tgt_undetach = torch.gather(output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
427
+ if self.embed_init_tgt:
428
+ tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) # nq, bs, d_model
429
+ else:
430
+ tgt_ = tgt_undetach.detach()
431
+
432
+ if refpoint_embed is not None:
433
+ refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
434
+ tgt = torch.cat([tgt, tgt_], dim=1)
435
+ else:
436
+ refpoint_embed, tgt = refpoint_embed_, tgt_
437
+
438
+ elif self.two_stage_type == 'no':
439
+ tgt_ = self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) # nq, bs, d_model
440
+ refpoint_embed_ = self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1) # nq, bs, 4
441
+
442
+ if refpoint_embed is not None:
443
+ refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
444
+ tgt = torch.cat([tgt, tgt_], dim=1)
445
+ else:
446
+ refpoint_embed, tgt = refpoint_embed_, tgt_
447
+
448
+ if self.num_patterns > 0:
449
+ tgt_embed = tgt.repeat(1, self.num_patterns, 1)
450
+ refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
451
+ tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(self.num_queries,
452
+ 1) # 1, n_q*n_pat, d_model
453
+ tgt = tgt_embed + tgt_pat
454
+
455
+ init_box_proposal = refpoint_embed_.sigmoid()
456
+
457
+ else:
458
+ raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
459
+ #########################################################
460
+ # End preparing tgt
461
+ # - tgt: bs, NQ, d_model
462
+ # - refpoint_embed(unsigmoid): bs, NQ, d_model
463
+ #########################################################
464
+ # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
465
+ # if refpoint_embed.isnan().any() | refpoint_embed.isinf().any():
466
+ # import ipdb; ipdb.set_trace()
467
+ # if tgt.isnan().any() | tgt.isinf().any():
468
+ # import ipdb; ipdb.set_trace()
469
+
470
+ #########################################################
471
+ # Begin Decoder
472
+ #########################################################
473
+ hs, references = self.decoder(
474
+ tgt=tgt.transpose(0, 1),
475
+ memory=memory.transpose(0, 1),
476
+ memory_key_padding_mask=mask_flatten,
477
+ pos=lvl_pos_embed_flatten.transpose(0, 1),
478
+ refpoints_unsigmoid=refpoint_embed.transpose(0, 1),
479
+ level_start_index=level_start_index,
480
+ spatial_shapes=spatial_shapes,
481
+ valid_ratios=valid_ratios, tgt_mask=attn_mask,
482
+ tgt_mask2=attn_mask2,
483
+ memory_text=text_dict['encoded_text'],
484
+ text_attention_mask=~text_dict['text_token_mask'],
485
+ text_dict=text_dict,
486
+ dn_meta=dn_meta,
487
+ targets=targets,
488
+ kpt_embed=kpt_embed
489
+ # we ~ the mask . False means use the token; True means pad the token
490
+ )
491
+ #########################################################
492
+ # End Decoder
493
+ # hs: n_dec, bs, nq, d_model
494
+ # references: n_dec+1, bs, nq, query_dim
495
+ #########################################################
496
+
497
+ #########################################################
498
+ # Begin postprocess
499
+ #########################################################
500
+ if self.two_stage_type == 'standard':
501
+ if self.two_stage_keep_all_tokens:
502
+ hs_enc = output_memory.unsqueeze(0)
503
+ ref_enc = enc_outputs_coord_unselected.unsqueeze(0)
504
+ init_box_proposal = output_proposals
505
+ # import ipdb; ipdb.set_trace()
506
+ else:
507
+ hs_enc = tgt_undetach.unsqueeze(0)
508
+ ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0)
509
+ else:
510
+ hs_enc = ref_enc = None
511
+ #########################################################
512
+ # End postprocess
513
+ # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None
514
+ # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None
515
+ #########################################################
516
+
517
+ return hs, references, hs_enc, ref_enc, init_box_proposal
518
+ # hs: (n_dec, bs, nq, d_model)
519
+ # references: sigmoid coordinates. (n_dec+1, bs, bq, 4)
520
+ # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None
521
+ # ref_enc: sigmoid coordinates. \
522
+ # (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None
523
+
524
+
525
class TransformerEncoder(nn.Module):
    """Stack of image encoder layers with optional text branches.

    On every iteration the encoder optionally runs an image/text fusion
    layer, then optionally a text enhancement layer, and finally the image
    encoder layer itself.
    """

    def __init__(self,
                 encoder_layer, num_layers, d_model=256,
                 num_queries=300,
                 enc_layer_share=False,
                 text_enhance_layer=None,
                 feature_fusion_layer=None,
                 use_checkpoint=False,
                 use_transformer_ckpt=False,
                 ):
        """Build the per-iteration module lists.

        Args:
            encoder_layer: image encoder layer, replicated ``num_layers`` times
                through ``_get_clones``.
            num_layers (int): number of encoder iterations; with ``0`` no
                layer of any kind is built.
            d_model (int, optional): stored as ``self.d_model``. Defaults to 256.
            num_queries (int, optional): stored as ``self.num_queries``.
                Defaults to 300.
            enc_layer_share (bool, optional): forwarded to ``_get_clones`` as
                ``layer_share``. Defaults to False.
            text_enhance_layer: optional text layer, replicated per iteration.
            feature_fusion_layer: optional fusion layer, replicated per iteration.
            use_checkpoint (bool, optional): run the fusion layers through
                ``checkpoint.checkpoint``. Defaults to False.
            use_transformer_ckpt (bool, optional): run the image encoder
                layers through ``checkpoint.checkpoint``. Defaults to False.
        """
        super().__init__()
        # Empty plain lists mean "branch disabled"; forward() tests their truthiness.
        self.layers = []
        self.text_layers = []
        self.fusion_layers = []
        if num_layers > 0:
            self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share)

            if text_enhance_layer is not None:
                self.text_layers = _get_clones(text_enhance_layer, num_layers, layer_share=enc_layer_share)
            if feature_fusion_layer is not None:
                self.fusion_layers = _get_clones(feature_fusion_layer, num_layers, layer_share=enc_layer_share)

        self.query_scale = None
        self.num_queries = num_queries
        self.num_layers = num_layers
        self.d_model = d_model

        self.use_checkpoint = use_checkpoint
        self.use_transformer_ckpt = use_transformer_ckpt

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """Return cell-centre reference points for every feature level.

        Args:
            spatial_shapes: iterable of ``(H, W)`` pairs, one per level.
            valid_ratios: tensor indexed as ``[batch, level, (x, y)]``.
            device: device on which the grids are created.

        Returns:
            Tensor of shape ``[bs, sum(H * W), num_level, 2]`` holding
            ``(x, y)`` points scaled by the valid ratios.
        """
        reference_points_list = []
        for lvl, (H_, W_) in enumerate(spatial_shapes):
            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
                                          torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),)
            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
            ref = torch.stack((ref_x, ref_y), -1)
            reference_points_list.append(ref)
        reference_points = torch.cat(reference_points_list, 1)
        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
        return reference_points

    def forward(self,
                # for images
                src: Tensor,
                pos: Tensor,
                spatial_shapes: Tensor,
                level_start_index: Tensor,
                valid_ratios: Tensor,
                key_padding_mask: Tensor,
                # for texts
                memory_text: Tensor = None,
                text_attention_mask: Tensor = None,
                pos_text: Tensor = None,
                text_self_attention_masks: Tensor = None,
                position_ids: Tensor = None,
                ):
        """
        Input:
            - src: [bs, sum(hi*wi), 256]
            - pos: pos embed for src. [bs, sum(hi*wi), 256]
            - spatial_shapes: h,w of each level [num_level, 2]
            - level_start_index: [num_level] start point of level in sum(hi*wi).
            - valid_ratios: [bs, num_level, 2]
            - key_padding_mask: [bs, sum(hi*wi)]

            - memory_text: bs, n_text, 256
            - text_attention_mask: bs, n_text
                False for no padding; True for padding
            - pos_text: bs, n_text, 256
            - text_self_attention_masks: optional mask for the text layers; it
                is inverted before use and may be None (no mask is applied).

            - position_ids: bs, n_text
        Intermediate:
            - reference_points: [bs, sum(hi*wi), num_level, 2]
        Outputs:
            - output: [bs, sum(hi*wi), 256]
            - memory_text: the (possibly updated) text features
        """

        output = src

        # preparation and reshape
        if self.num_layers > 0:
            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)

        if self.text_layers:
            # generate pos_text
            bs, n_text, _ = memory_text.shape
            if pos_text is None and position_ids is None:
                pos_text = torch.arange(n_text, device=memory_text.device).float().unsqueeze(0).unsqueeze(-1).repeat(
                    bs, 1, 1)
                pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False)
            if position_ids is not None:
                pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False)

        # The text layers receive the inverted mask; tolerate a missing mask
        # instead of failing on `~None`.
        text_src_mask = ~text_self_attention_masks if text_self_attention_masks is not None else None

        # main process
        for layer_id, layer in enumerate(self.layers):
            if self.fusion_layers:
                if self.use_checkpoint:
                    output, memory_text = checkpoint.checkpoint(
                        self.fusion_layers[layer_id],
                        output,
                        memory_text,
                        key_padding_mask,
                        text_attention_mask
                    )
                else:
                    output, memory_text = self.fusion_layers[layer_id](v=output, l=memory_text,
                                                                       attention_mask_v=key_padding_mask,
                                                                       attention_mask_l=text_attention_mask)

            if self.text_layers:
                # The text layer is called with sequence-first tensors, hence the transposes.
                memory_text = self.text_layers[layer_id](
                    src=memory_text.transpose(0, 1),
                    src_mask=text_src_mask,
                    src_key_padding_mask=text_attention_mask,
                    pos=(pos_text.transpose(0, 1) if pos_text is not None else None)
                ).transpose(0, 1)

            # image encoder layer
            if self.use_transformer_ckpt:
                output = checkpoint.checkpoint(
                    layer,
                    output,
                    pos,
                    reference_points,
                    spatial_shapes,
                    level_start_index,
                    key_padding_mask
                )
            else:
                output = layer(src=output, pos=pos, reference_points=reference_points, spatial_shapes=spatial_shapes,
                               level_start_index=level_start_index, key_padding_mask=key_padding_mask)

        return output, memory_text
688
+
689
+
690
class TransformerDecoder(nn.Module):
    """Decoder that iteratively refines reference points layer by layer.

    The first ``num_box_decoder_layers`` layers refine box queries only. After
    the last of those layers the 50 highest-scoring queries are kept and each
    is expanded into one box query followed by ``num_body_points`` keypoint
    queries; the remaining layers refine boxes and keypoints together.

    ``bbox_embed``, ``class_embed``, ``pose_embed`` and ``pose_hw_embed`` are
    initialised to ``None`` here and are indexed in ``forward``, so they must
    be assigned from outside before the decoder is run.
    """

    def __init__(self, decoder_layer, num_layers, norm=None,
                 return_intermediate=False,
                 d_model=256, query_dim=4,
                 modulate_hw_attn=False,
                 num_feature_levels=1,
                 deformable_decoder=False,
                 decoder_query_perturber=None,
                 dec_layer_number=None,  # number of queries each layer in decoder
                 rm_dec_query_scale=False,
                 dec_layer_share=False,
                 dec_layer_dropout_prob=None,
                 use_detached_boxes_dec_out=False,
                 num_box_decoder_layers=2,
                 num_body_points=68,
                 ):
        """Create the decoder layers and the query-position heads.

        Args:
            decoder_layer: layer replicated ``num_layers`` times via ``_get_clones``.
            num_layers: number of decoder layers.
            norm: module applied to every layer output before it is stored;
                ``forward`` calls it unconditionally.
            return_intermediate: must be truthy (asserted).
            d_model: feature width used to size the MLP heads.
            query_dim: 2 or 4 (asserted).
            modulate_hw_attn: with ``deformable_decoder`` False, enables ``ref_anchor_head``.
            num_feature_levels: stored on the instance.
            deformable_decoder: when False, ``query_pos_sine_scale`` is built.
            decoder_query_perturber: stored on the instance.
            dec_layer_number: optional list with one entry per layer (asserted).
            rm_dec_query_scale: must be truthy; the other branch raises
                ``NotImplementedError``.
            dec_layer_share: forwarded to ``_get_clones`` as ``layer_share``.
            dec_layer_dropout_prob: optional list of per-layer probabilities
                in [0, 1] (asserted).
            use_detached_boxes_dec_out: selects whether detached or attached
                reference points are collected in ``forward``.
            num_box_decoder_layers: number of leading box-only layers.
            num_body_points: keypoints per instance.

        Raises:
            NotImplementedError: if ``rm_dec_query_scale`` is falsy.
        """
        super().__init__()
        if num_layers > 0:
            self.layers = _get_clones(decoder_layer, num_layers, layer_share=dec_layer_share)
        else:
            self.layers = []
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate
        assert return_intermediate, "support return_intermediate only"
        self.query_dim = query_dim
        assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim)
        self.num_feature_levels = num_feature_levels
        self.use_detached_boxes_dec_out = use_detached_boxes_dec_out

        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)
        if not deformable_decoder:
            self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2)
        else:
            self.query_pos_sine_scale = None

        if rm_dec_query_scale:
            self.query_scale = None
        else:
            raise NotImplementedError
            # NOTE(review): the statement below is unreachable because of the raise above.
            self.query_scale = MLP(d_model, d_model, d_model, 2)
        # Prediction heads; left unset here and indexed per layer in forward().
        self.bbox_embed = None
        self.class_embed = None
        self.pose_embed = None
        self.pose_hw_embed = None
        self.d_model = d_model
        self.modulate_hw_attn = modulate_hw_attn
        self.deformable_decoder = deformable_decoder

        if not deformable_decoder and modulate_hw_attn:
            self.ref_anchor_head = MLP(d_model, d_model, 2, 2)
        else:
            self.ref_anchor_head = None

        self.decoder_query_perturber = decoder_query_perturber
        self.box_pred_damping = None

        self.dec_layer_number = dec_layer_number
        if dec_layer_number is not None:
            assert isinstance(dec_layer_number, list)
            assert len(dec_layer_number) == num_layers
            # assert dec_layer_number[0] ==

        self.dec_layer_dropout_prob = dec_layer_dropout_prob
        if dec_layer_dropout_prob is not None:
            assert isinstance(dec_layer_dropout_prob, list)
            assert len(dec_layer_dropout_prob) == num_layers
            for i in dec_layer_dropout_prob:
                assert 0.0 <= i <= 1.0

        self.rm_detach = None
        self.num_body_points = num_body_points

        # Learned width/height factors for keypoint boxes: 17 rows here plus
        # (num_body_points - 17) rows in hw_append; forward() concatenates them.
        self.hw = nn.Embedding(17, 2)
        self.num_box_decoder_layers = num_box_decoder_layers
        # Positions of the keypoint queries inside 50 groups of
        # (num_body_points + 1) queries; position 0 of each group is skipped.
        # NOTE(review): the literal 50 equals inter_select_number in forward().
        self.kpt_index = [x for x in range(50 * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
        self.hw_append = nn.Embedding(self.num_body_points-17, 2)

    def forward(self, tgt, memory,
                tgt_mask: Optional[Tensor] = None,
                tgt_mask2: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                refpoints_unsigmoid: Optional[Tensor] = None,  # num_queries, bs, 2
                # for memory
                level_start_index: Optional[Tensor] = None,  # num_levels
                spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                valid_ratios: Optional[Tensor] = None,
                # for text
                memory_text: Optional[Tensor] = None,
                text_attention_mask: Optional[Tensor] = None,
                text_dict: Optional[Tensor] = None,
                dn_meta: Optional[Tensor] = None,
                targets: Optional[Tensor] = None,
                kpt_embed: Optional[Tensor] = None
                ):
        """
        Input:
            - tgt: nq, bs, d_model
            - memory: hw, bs, d_model
            - pos: hw, bs, d_model
            - refpoints_unsigmoid: nq, bs, 2/4
            - valid_ratios/spatial_shapes: bs, nlevel, 2
            - tgt_mask: self-attention mask used until the query set is
              re-laid-out; tgt_mask2 replaces it afterwards.
            - dn_meta: read as dn_meta['pad_size'] in training mode only.
            - targets: not used in this method.

        Returns:
            A two-element list: the normalised output of every layer and the
            reference points (initial ones first), each with dims 0 and 1
            transposed.
        """

        output = tgt
        # Adds a zero-valued term that depends on self.hw. This is an in-place
        # op, so the caller's `tgt` tensor object is the one being modified.
        # NOTE(review): presumably keeps self.hw in the autograd graph - confirm.
        output += self.hw.weight[0, 0] * 0.0


        intermediate = []
        reference_points = refpoints_unsigmoid.sigmoid()
        ref_points = [reference_points]
        # Number of leading queries treated as a separate prefix ("dn") during training.
        effect_num_dn = dn_meta['pad_size'] if self.training else 0
        inter_select_number = 50
        for layer_id, layer in enumerate(self.layers):

            if reference_points.shape[-1] == 4:
                reference_points_input = reference_points[:, :, None] \
                                         * torch.cat([valid_ratios, valid_ratios], -1)[None, :]  # nq, bs, nlevel, 4
            else:
                assert reference_points.shape[-1] == 2
                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
            query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :])  # nq, bs, 256*2

            # conditional query
            raw_query_pos = self.ref_point_head(query_sine_embed)  # nq, bs, 256
            pos_scale = self.query_scale(output) if self.query_scale is not None else 1
            query_pos = pos_scale * raw_query_pos
            # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
            #     if query_pos.isnan().any() | query_pos.isinf().any():
            #         import ipdb; ipdb.set_trace()

            # main process
            output = layer(
                tgt=output,
                tgt_query_pos=query_pos,
                tgt_query_sine_embed=query_sine_embed,
                tgt_key_padding_mask=tgt_key_padding_mask,
                tgt_reference_points=reference_points_input,

                memory_text=memory_text,
                text_attention_mask=text_attention_mask,

                memory=memory,
                memory_key_padding_mask=memory_key_padding_mask,
                memory_level_start_index=level_start_index,
                memory_spatial_shapes=spatial_shapes,
                memory_pos=pos,

                self_attn_mask=tgt_mask,
                cross_attn_mask=memory_mask
            )
            # Diagnostic only: reports NaN/Inf counts on stdout and continues.
            if output.isnan().any() | output.isinf().any():
                print(f"output layer_id {layer_id} is nan")
                try:
                    num_nan = output.isnan().sum().item()
                    num_inf = output.isinf().sum().item()
                    print(f"num_nan {num_nan}, num_inf {num_inf}")
                except Exception as e:
                    print(e)





            intermediate.append(self.norm(output))
            # iter update
            # Box-only stage: every query is refined as a box.
            if layer_id < self.num_box_decoder_layers:
                reference_before_sigmoid = inverse_sigmoid(reference_points)
                delta_unsig = self.bbox_embed[layer_id](output)
                outputs_unsig = delta_unsig + reference_before_sigmoid
                new_reference_points = outputs_unsig.sigmoid()

            # select # ref points as anchors
            # Last box-only layer: keep the top `inter_select_number` queries by
            # maximum class score and expand each into box + keypoint queries.
            if layer_id == self.num_box_decoder_layers - 1:
                dn_output = output[:effect_num_dn]
                dn_new_reference_points = new_reference_points[:effect_num_dn]
                class_unselected = self.class_embed[layer_id](output.transpose(0, 1), text_dict)[:,
                                   effect_num_dn:].transpose(0, 1)
                topk_proposals = torch.topk(class_unselected.max(-1)[0], inter_select_number, dim=0)[1]
                new_reference_points_for_box = torch.gather(new_reference_points[effect_num_dn:], 0,
                                                            topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
                new_output_for_box = torch.gather(output[effect_num_dn:], 0,
                                                  topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model))
                keypoint_embed=kpt_embed.transpose(0, 1)

                # The same keypoint embeddings are repeated for every selected box.
                new_output_for_keypoint = keypoint_embed[None, :, :, :].repeat(new_output_for_box.shape[0],1,1,1)
                delta_xy = self.pose_embed[-1](new_output_for_keypoint)[..., :2]
                # Keypoint centre = box centre shifted (in logit space) by delta_xy.
                keypoint_xy = (inverse_sigmoid(new_reference_points_for_box[..., :2][:, None]) + delta_xy).sigmoid()
                num_queries, _, bs, _ = keypoint_xy.shape
                aa = torch.cat((self.hw.weight,self.hw_append.weight),dim=0)
                keypoint_wh_weight = aa.unsqueeze(0).unsqueeze(-2).repeat(num_queries, 1, bs, 1).sigmoid()
                # Keypoint w/h = learned factor in (0, 1) times the box w/h.
                keypoint_wh = keypoint_wh_weight * new_reference_points_for_box[..., 2:][:, None]
                new_reference_points_for_keypoint = torch.cat((keypoint_xy, keypoint_wh), dim=-1)
                # Interleave so each group is [box, kpt_1, ..., kpt_K] along dim 0.
                new_reference_points = torch.cat(
                    (new_reference_points_for_box.unsqueeze(1), new_reference_points_for_keypoint), dim=1).flatten(0, 1)
                output = torch.cat((new_output_for_box.unsqueeze(1), new_output_for_keypoint), dim=1).flatten(0, 1)
                new_reference_points = torch.cat((dn_new_reference_points, new_reference_points), dim=0)
                output = torch.cat((dn_output, output), dim=0)
                # The query layout changed, so later layers use the second mask.
                tgt_mask = tgt_mask2

            # Box + keypoint stage: boxes sit at every (num_body_points + 1)-th
            # position after the dn prefix, keypoints at self.kpt_index.
            if layer_id >= self.num_box_decoder_layers:
                reference_before_sigmoid = inverse_sigmoid(reference_points)
                output_bbox_dn = output[:effect_num_dn]
                output_bbox_norm = output[effect_num_dn:][0::(self.num_body_points + 1)]
                reference_before_sigmoid_bbox_dn = reference_before_sigmoid[:effect_num_dn]
                reference_before_sigmoid_bbox_norm = reference_before_sigmoid[effect_num_dn:][
                                                     0::(self.num_body_points + 1)]
                delta_unsig_dn = self.bbox_embed[layer_id](output_bbox_dn)
                delta_unsig_norm = self.bbox_embed[layer_id](output_bbox_norm)
                outputs_unsig_dn = delta_unsig_dn + reference_before_sigmoid_bbox_dn
                outputs_unsig_norm = delta_unsig_norm + reference_before_sigmoid_bbox_norm
                new_reference_points_for_box_dn = outputs_unsig_dn.sigmoid()
                new_reference_points_for_box_norm = outputs_unsig_norm.sigmoid()
                output_kpt = output[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index, device=output.device))
                # Pose heads are indexed relative to the first keypoint layer.
                delta_xy_unsig = self.pose_embed[layer_id - self.num_box_decoder_layers](output_kpt)
                outputs_unsig = reference_before_sigmoid[effect_num_dn:].index_select(0, torch.tensor(self.kpt_index,
                                                                                                      device=output.device)).clone()  ##
                delta_hw_unsig = self.pose_hw_embed[layer_id - self.num_box_decoder_layers](output_kpt)
                outputs_unsig[..., :2] += delta_xy_unsig[..., :2]
                outputs_unsig[..., 2:] += delta_hw_unsig
                new_reference_points_for_keypoint = outputs_unsig.sigmoid()
                bs = new_reference_points_for_box_norm.shape[1]
                new_reference_points_norm = torch.cat((new_reference_points_for_box_norm.unsqueeze(1),
                                                       new_reference_points_for_keypoint.view(-1, self.num_body_points,
                                                                                              bs, 4)), dim=1).flatten(0,
                                                                                                                      1)
                new_reference_points = torch.cat((new_reference_points_for_box_dn, new_reference_points_norm), dim=0)

            # Unless rm_detach contains 'dec', the points fed to the next layer are detached.
            if self.rm_detach and 'dec' in self.rm_detach:
                reference_points = new_reference_points
            else:
                reference_points = new_reference_points.detach()

            # if layer_id != self.num_layers - 1:
            if self.use_detached_boxes_dec_out:
                ref_points.append(reference_points)
            else:
                ref_points.append(new_reference_points)

        return [
            [itm_out.transpose(0, 1) for itm_out in intermediate],
            [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points]
        ]
936
+
937
+
938
class DeformableTransformerEncoderLayer(nn.Module):
    """Encoder layer: multi-scale deformable self-attention followed by an FFN.

    Both sub-blocks use a residual connection followed by LayerNorm. An
    optional channel-attention step can be appended after the FFN.
    """

    def __init__(self,
                 d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 add_channel_attention=False,
                 use_deformable_box_attn=False,
                 box_attn_type='roi_align',
                 ):
        """Create the sub-modules.

        ``use_deformable_box_attn`` and ``box_attn_type`` are accepted but not
        read by this constructor.
        """
        super().__init__()

        # --- attention sub-block ---
        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # --- feed-forward sub-block ---
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # --- optional channel attention ---
        self.add_channel_attention = add_channel_attention
        if add_channel_attention:
            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
            self.norm_channel = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Apply the feed-forward sub-block with residual connection and norm."""
        hidden = self.activation(self.linear1(src))
        projected = self.linear2(self.dropout2(hidden))
        residual = src + self.dropout3(projected)
        return self.norm2(residual)

    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
        """Run attention, FFN and (optionally) channel attention on ``src``."""
        # Positional embedding is added to the query only; the value input is the raw src.
        query = self.with_pos_embed(src, pos)
        attended = self.self_attn(query, reference_points, src, spatial_shapes, level_start_index,
                                  key_padding_mask)
        src = self.norm1(src + self.dropout1(attended))

        src = self.forward_ffn(src)

        if not self.add_channel_attention:
            return src
        return self.norm_channel(src + self.activ_channel(src))
994
+
995
+
996
class DeformableTransformerDecoderLayer(nn.Module):
    """Decoder layer: self-attention, optional text cross-attention,
    deformable cross-attention over the image memory, then an FFN.

    Every sub-block is residual and followed by its own LayerNorm.
    """

    def __init__(self, d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 use_text_feat_guide=False,
                 use_text_cross_attention=False,
                 ffn_extra_layernorm=False
                 ):
        """Create the sub-modules.

        Raises:
            NotImplementedError: if ``ffn_extra_layernorm`` is truthy.
            AssertionError: if ``use_text_feat_guide`` is truthy.
        """
        super().__init__()

        # image cross-attention
        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm1 = nn.LayerNorm(d_model)

        # text cross-attention (only built on request)
        if use_text_cross_attention:
            self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
            self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
            self.catext_norm = nn.LayerNorm(d_model)

        # query self-attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm2 = nn.LayerNorm(d_model)

        # feed-forward network
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
        self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm3 = nn.LayerNorm(d_model)
        if ffn_extra_layernorm:
            raise NotImplementedError('ffn_extra_layernorm not implemented')
        self.norm_ext = None

        self.key_aware_proj = None
        self.use_text_feat_guide = use_text_feat_guide
        assert not use_text_feat_guide
        self.use_text_cross_attention = use_text_cross_attention

    def rm_self_attn_modules(self):
        """Disable the self-attention sub-block by clearing its modules."""
        for attr_name in ("self_attn", "dropout2", "norm2"):
            setattr(self, attr_name, None)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` itself when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt, ipdb_flag=False):
        """Apply the FFN sub-block; ``ipdb_flag`` is accepted but unused."""
        # The two linear layers run with CUDA autocast explicitly disabled.
        with torch.cuda.amp.autocast(enabled=False):
            hidden = self.activation(self.linear1(tgt))
            ffn_out = self.linear2(self.dropout3(hidden))

        return self.norm3(tgt + self.dropout4(ffn_out))

    def forward(self,
                # for tgt
                tgt: Optional[Tensor],  # nq, bs, d_model
                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                tgt_key_padding_mask: Optional[Tensor] = None,
                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                memory_text: Optional[Tensor] = None,  # bs, num_token, d_model
                text_attention_mask: Optional[Tensor] = None,  # bs, num_token

                # for memory
                memory: Optional[Tensor] = None,  # hw, bs, d_model
                memory_key_padding_mask: Optional[Tensor] = None,
                memory_level_start_index: Optional[Tensor] = None,  # num_levels
                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                memory_pos: Optional[Tensor] = None,  # pos for memory

                # sa
                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                ):
        """Run one decoder step.

        Input:
            - tgt/tgt_query_pos: nq, bs, d_model
            - cross_attn_mask must be None (asserted).
        Returns the updated ``tgt``.
        """
        assert cross_attn_mask is None

        # 1) self-attention among the queries (skipped once removed)
        if self.self_attn is not None:
            pos_query = self.with_pos_embed(tgt, tgt_query_pos)
            sa_out = self.self_attn(pos_query, pos_query, tgt, attn_mask=self_attn_mask)[0]
            tgt = self.norm2(tgt + self.dropout2(sa_out))

        # 2) cross-attention from the queries to the text tokens
        if self.use_text_cross_attention:
            text_query = self.with_pos_embed(tgt, tgt_query_pos)
            text_out = self.ca_text(text_query, memory_text.transpose(0, 1),
                                    memory_text.transpose(0, 1), key_padding_mask=text_attention_mask)[0]
            tgt = self.catext_norm(tgt + self.catext_dropout(text_out))

        # 3) deformable cross-attention to the image memory; the attention
        #    module is called with dims 0 and 1 swapped, hence the transposes
        img_query = self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1)
        img_refs = tgt_reference_points.transpose(0, 1).contiguous()
        img_out = self.cross_attn(img_query,
                                  img_refs,
                                  memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index,
                                  memory_key_padding_mask).transpose(0, 1)
        tgt = self.norm1(tgt + self.dropout1(img_out))

        # 4) feed-forward network
        return self.forward_ffn(tgt)
1133
+
1134
+
1135
+ def _get_clones(module, N, layer_share=False):
1136
+ # import ipdb; ipdb.set_trace()
1137
+ if layer_share:
1138
+ return nn.ModuleList([module for i in range(N)])
1139
+ else:
1140
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
1141
+
1142
+
1143
def build_deformable_transformer(args):
    """Construct a ``DeformableTransformer`` from a configuration object.

    Args:
        args: object exposing the configuration as attributes. The attributes
            ``use_detached_boxes_dec_out``, ``binary_query_selection`` and
            ``ffn_extra_layernorm`` are optional and default to ``False``;
            every other attribute read below is required.

    Returns:
        The configured ``DeformableTransformer`` instance.
    """
    decoder_query_perturber = None
    if args.decoder_layer_noise:
        # Imported lazily: only needed when decoder-layer noise is enabled.
        from .utils import RandomBoxPerturber
        decoder_query_perturber = RandomBoxPerturber(
            x_noise_scale=args.dln_xy_noise, y_noise_scale=args.dln_xy_noise,
            w_noise_scale=args.dln_hw_noise, h_noise_scale=args.dln_hw_noise)

    # Optional settings: fall back to False when the attribute is absent,
    # without swallowing unrelated exceptions the way a bare `except:` does.
    use_detached_boxes_dec_out = getattr(args, 'use_detached_boxes_dec_out', False)
    binary_query_selection = getattr(args, 'binary_query_selection', False)

    if hasattr(args, 'ffn_extra_layernorm'):
        ffn_extra_layernorm = args.ffn_extra_layernorm
    else:
        print('ffn_extra_layernorm not found, set to False')
        ffn_extra_layernorm = False

    return DeformableTransformer(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        num_queries=args.num_queries,
        dim_feedforward=args.dim_feedforward,
        num_encoder_layers=args.enc_layers,
        num_unicoder_layers=args.unic_layers,
        num_decoder_layers=args.dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
        query_dim=args.query_dim,
        activation=args.transformer_activation,
        num_patterns=args.num_patterns,
        modulate_hw_attn=True,

        deformable_encoder=True,
        deformable_decoder=True,
        num_feature_levels=args.num_feature_levels,
        enc_n_points=args.enc_n_points,
        dec_n_points=args.dec_n_points,
        use_deformable_box_attn=args.use_deformable_box_attn,
        box_attn_type=args.box_attn_type,

        learnable_tgt_init=True,
        decoder_query_perturber=decoder_query_perturber,

        add_channel_attention=args.add_channel_attention,
        add_pos_value=args.add_pos_value,
        random_refpoints_xy=args.random_refpoints_xy,

        # two stage
        two_stage_type=args.two_stage_type,  # ['no', 'standard', 'early']
        two_stage_pat_embed=args.two_stage_pat_embed,
        two_stage_add_query_num=args.two_stage_add_query_num,
        two_stage_learn_wh=args.two_stage_learn_wh,
        two_stage_keep_all_tokens=args.two_stage_keep_all_tokens,
        dec_layer_number=args.dec_layer_number,
        rm_self_attn_layers=None,
        key_aware_type=None,
        layer_share_type=None,

        rm_detach=None,
        decoder_sa_type=args.decoder_sa_type,
        module_seq=args.decoder_module_seq,

        embed_init_tgt=args.embed_init_tgt,
        use_detached_boxes_dec_out=use_detached_boxes_dec_out,
        use_text_enhancer=args.use_text_enhancer,
        use_fusion_layer=args.use_fusion_layer,
        use_checkpoint=args.use_checkpoint,
        use_transformer_ckpt=args.use_transformer_ckpt,
        use_text_cross_attention=args.use_text_cross_attention,

        text_dropout=args.text_dropout,
        fusion_dropout=args.fusion_dropout,
        fusion_droppath=args.fusion_droppath,

        binary_query_selection=binary_query_selection,
        ffn_extra_layernorm=ffn_extra_layernorm,
    )
python/utils/dependencies/XPose/models/UniPose/fuse_modules.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ # from timm.models.layers import DropPath
6
+ from src.modules.util import DropPath
7
+
8
class FeatureResizer(nn.Module):
    """
    Project embeddings from ``input_feat_size`` to ``output_feat_size``
    channels with a linear layer, then apply an optional LayerNorm and
    dropout.
    """

    def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
        super().__init__()
        self.do_ln = do_ln
        # The LayerNorm is always created; do_ln only controls whether forward() uses it.
        self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True)
        self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout)

    def forward(self, encoder_features):
        """Return the resized (and optionally normalised) features."""
        projected = self.fc(encoder_features)
        normed = self.layer_norm(projected) if self.do_ln else projected
        return self.dropout(normed)
28
+
29
+
30
+
31
+
32
def l1norm(X, dim, eps=1e-8):
    """Divide X by its L1 norm taken along ``dim`` (plus ``eps`` for stability)."""
    denom = X.abs().sum(dim=dim, keepdim=True) + eps
    return X / denom
38
+
39
+
40
def l2norm(X, dim, eps=1e-8):
    """Divide X by its L2 norm taken along ``dim`` (plus ``eps`` for stability)."""
    squared_sum = X.pow(2).sum(dim=dim, keepdim=True)
    denom = squared_sum.sqrt() + eps
    return X / denom
46
+
47
+
48
def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8):
    """Attend from each query position over the context positions.

    Args:
        query: (n_context, queryL, d)
        context: (n_context, sourceL, d)
        smooth: factor applied to the scores before the final softmax.
        raw_feature_norm: how raw scores are normalised across the query
            axis first: "softmax", "l2norm" or "clipped_l2norm".
        eps: accepted for interface compatibility; not used here.

    Returns:
        weightedContext: (batch, queryL, d), context averaged per query.
        attnT: (batch, sourceL, queryL), attention weights; they sum to 1
            over the sourceL axis.

    Raises:
        ValueError: if ``raw_feature_norm`` is not a supported name.
    """
    # (batch, sourceL, d)(batch, d, queryL) --> (batch, sourceL, queryL)
    attn = torch.bmm(context, torch.transpose(query, 1, 2))

    # First normalisation, across the query axis. The softmax dim is given
    # explicitly; the implicit-dim form is deprecated and emits a warning.
    if raw_feature_norm == "softmax":
        attn = torch.softmax(attn, dim=2)
    elif raw_feature_norm == "l2norm":
        attn = l2norm(attn, 2)
    elif raw_feature_norm == "clipped_l2norm":
        attn = nn.LeakyReLU(0.1)(attn)
        attn = l2norm(attn, 2)
    else:
        raise ValueError(f"unknown first norm type: {raw_feature_norm}")

    # --> (batch, queryL, sourceL), then softmax across the source axis
    attn = torch.transpose(attn, 1, 2).contiguous()
    attn = torch.softmax(attn * smooth, dim=2)
    # --> (batch, sourceL, queryL)
    attnT = torch.transpose(attn, 1, 2).contiguous()

    # (batch, d, sourceL)(batch, sourceL, queryL) --> (batch, d, queryL)
    weightedContext = torch.bmm(torch.transpose(context, 1, 2), attnT)
    # --> (batch, queryL, d)
    weightedContext = torch.transpose(weightedContext, 1, 2)

    return weightedContext, attnT
95
+
96
+
97
class BiMultiHeadAttention(nn.Module):
    """Bidirectional multi-head cross-attention between two token streams.

    A single similarity matrix is computed between the projected ``v`` tokens
    (queries) and the projected ``l`` tokens (keys). It is soft-maxed along the
    ``l`` axis to update ``v`` from ``l`` values, and its transpose is
    soft-maxed along the ``v`` axis to update ``l`` from ``v`` values.
    """

    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None):
        """
        Args:
            v_dim: feature size of the ``v`` tokens.
            l_dim: feature size of the ``l`` tokens.
            embed_dim: shared attention width; must be divisible by ``num_heads``.
            num_heads: number of attention heads.
            dropout: dropout probability applied to the attention probabilities.
            cfg: accepted for interface compatibility; not used in this class.
        """
        super(BiMultiHeadAttention, self).__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.v_dim = v_dim
        self.l_dim = l_dim

        assert (
            self.head_dim * self.num_heads == self.embed_dim
        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
        self.scale = self.head_dim ** (-0.5)
        self.dropout = dropout

        # Query/key projections and the value projections of each stream.
        self.v_proj = nn.Linear(self.v_dim, self.embed_dim)
        self.l_proj = nn.Linear(self.l_dim, self.embed_dim)
        self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim)
        self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim)

        # Output projections back to each stream's own feature size.
        self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim)
        self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim)

        # Numerical-stability switches read in forward().
        self.stable_softmax_2d = True
        self.clamp_min_for_underflow = True
        self.clamp_max_for_overflow = True

        self._reset_parameters()

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape (bsz, seq_len, embed_dim) into (bsz, num_heads, seq_len, head_dim)."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def _reset_parameters(self):
        """Xavier-initialise every projection weight and zero every bias."""
        nn.init.xavier_uniform_(self.v_proj.weight)
        self.v_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.l_proj.weight)
        self.l_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.values_v_proj.weight)
        self.values_v_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.values_l_proj.weight)
        self.values_l_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.out_v_proj.weight)
        self.out_v_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.out_l_proj.weight)
        self.out_l_proj.bias.data.fill_(0)

    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
        """Run attention in both directions.

        Args:
            v: tensor of shape (bs, n_img, v_dim).
            l: tensor of shape (bs, n_text, l_dim).
            attention_mask_v: optional mask of shape (bs, n_img); positions that
                are True are filled with -inf before the softmax over ``v``
                tokens. It is passed to ``masked_fill_``, so it is presumably
                expected to be a boolean tensor.
            attention_mask_l: optional mask of shape (bs, n_text); positions that
                are True are filled with -inf before the softmax over ``l``
                tokens. Same dtype expectation as ``attention_mask_v``.

        Returns:
            Tuple ``(attn_output_v, attn_output_l)`` with shapes
            (bs, n_img, v_dim) and (bs, n_text, l_dim).

        Raises:
            ValueError: if an intermediate tensor does not have the expected size.
        """
        bsz, tgt_len, _ = v.size()

        query_states = self.v_proj(v) * self.scale
        key_states = self._shape(self.l_proj(l), -1, bsz)
        value_v_states = self._shape(self.values_v_proj(v), -1, bsz)
        value_l_states = self._shape(self.values_l_proj(l), -1, bsz)

        # Fold the head axis into the batch axis so torch.bmm can be used.
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_v_states = value_v_states.view(*proj_shape)
        value_l_states = value_l_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))  # bs*nhead, nimg, ntxt

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
            )

        if self.stable_softmax_2d:
            # Softmax is shift-invariant; subtracting the max keeps the logits small.
            attn_weights = attn_weights - attn_weights.max()

        if self.clamp_min_for_underflow:
            attn_weights = torch.clamp(attn_weights, min=-50000)  # Do not increase -50000, data type half has quite limited range
        if self.clamp_max_for_overflow:
            attn_weights = torch.clamp(attn_weights, max=50000)  # Do not increase 50000, data type half has quite limited range

        # Logits for the l -> v direction: transpose, then shift each row by its max.
        attn_weights_T = attn_weights.transpose(1, 2)
        attn_weights_l = (attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[
            0])
        if self.clamp_min_for_underflow:
            attn_weights_l = torch.clamp(attn_weights_l, min=-50000)  # Do not increase -50000, data type half has quite limited range
        if self.clamp_max_for_overflow:
            attn_weights_l = torch.clamp(attn_weights_l, max=50000)  # Do not increase 50000, data type half has quite limited range

        # mask vision for language
        if attention_mask_v is not None:
            attention_mask_v = attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
            attn_weights_l.masked_fill_(attention_mask_v, float('-inf'))

        attn_weights_l = attn_weights_l.softmax(dim=-1)

        # mask language for vision
        if attention_mask_l is not None:
            attention_mask_l = attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
            attn_weights.masked_fill_(attention_mask_l, float('-inf'))
        attn_weights_v = attn_weights.softmax(dim=-1)

        attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training)
        attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training)

        # Each stream is rebuilt from the *other* stream's values.
        attn_output_v = torch.bmm(attn_probs_v, value_l_states)
        attn_output_l = torch.bmm(attn_probs_l, value_v_states)

        # The messages report the same 3-D shape that is actually compared.
        expected_v_size = (bsz * self.num_heads, tgt_len, self.head_dim)
        if attn_output_v.size() != expected_v_size:
            raise ValueError(
                f"`attn_output_v` should be of size {expected_v_size}, but is {attn_output_v.size()}"
            )

        expected_l_size = (bsz * self.num_heads, src_len, self.head_dim)
        if attn_output_l.size() != expected_l_size:
            raise ValueError(
                f"`attn_output_l` should be of size {expected_l_size}, but is {attn_output_l.size()}"
            )

        # Unfold the head axis and merge heads back into the embedding axis.
        attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output_v = attn_output_v.transpose(1, 2)
        attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)

        attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim)
        attn_output_l = attn_output_l.transpose(1, 2)
        attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)

        attn_output_v = self.out_v_proj(attn_output_v)
        attn_output_l = self.out_l_proj(attn_output_l)

        return attn_output_v, attn_output_l
237
+
238
+
239
+ # Bi-Direction MHA (text->image, image->text)
240
class BiAttentionBlock(nn.Module):
    """Pre-norm bidirectional attention block with layer-scaled residual updates."""

    def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1,
                 drop_path=.0, init_values=1e-4, cfg=None):
        """
        Inputs:
            v_dim       - Feature size of the ``v`` stream
            l_dim       - Feature size of the ``l`` stream
            embed_dim   - Width of the attention feature vectors
            num_heads   - Number of heads used by the attention module
            dropout     - Dropout probability forwarded to the attention module
            drop_path   - DropPath rate; values <= 0 use an identity instead
            init_values - Initial value of the per-channel residual scales
            cfg         - Accepted for interface compatibility; unused here
        """
        super().__init__()

        # pre layer norm
        self.layer_norm_v = nn.LayerNorm(v_dim)
        self.layer_norm_l = nn.LayerNorm(l_dim)
        self.attn = BiMultiHeadAttention(
            v_dim=v_dim,
            l_dim=l_dim,
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=dropout,
        )

        # add layer scale for training stability
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        # The scales are registered as parameters but are frozen (requires_grad=False).
        self.gamma_v = nn.Parameter(init_values * torch.ones(v_dim), requires_grad=False)
        self.gamma_l = nn.Parameter(init_values * torch.ones(l_dim), requires_grad=False)

    def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
        """Normalise both streams, attend in both directions, add scaled updates.

        Note that the residual is added to the *normalised* inputs.
        """
        normed_v = self.layer_norm_v(v)
        normed_l = self.layer_norm_l(l)
        delta_v, delta_l = self.attn(
            normed_v,
            normed_l,
            attention_mask_v=attention_mask_v,
            attention_mask_l=attention_mask_l,
        )
        updated_v = normed_v + self.drop_path(self.gamma_v * delta_v)
        updated_l = normed_l + self.drop_path(self.gamma_l * delta_l)
        return updated_v, updated_l
python/utils/dependencies/XPose/models/UniPose/mask_generate.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
def prepare_for_mask(kpt_mask, num_heads=8, num_group=50, group_bbox_kpt=69):
    """Build the grouped self-attention mask for the query tokens.

    The queries are laid out as ``num_group`` consecutive groups of
    ``group_bbox_kpt`` tokens. In the returned mask, True means "blocked":

    * every pair of tokens belonging to different groups is blocked;
    * inside each group's diagonal block, a pair (i, j) is blocked exactly
      when ``kpt_mask[b, i] != kpt_mask[b, j]``.

    The mask is created on ``kpt_mask.device`` rather than on a hard-coded
    CUDA device, so the function also works on CPU and the result always
    lives on the same device as its input.

    Args:
        kpt_mask: 2-D tensor of shape (bs, length). ``length`` is expected to
            equal ``group_bbox_kpt`` so the per-group blocks line up with the
            groups (the block size used below is ``length``).
        num_heads: number of attention heads the mask is replicated for.
        num_group: number of query groups.
        group_bbox_kpt: number of tokens per group.

    Returns:
        ``(None, None, None, attn_mask2, None)`` where ``attn_mask2`` is a bool
        tensor of shape (bs * num_heads, T, T) with
        ``T = num_group * group_bbox_kpt``. The ``None`` entries keep the
        positional layout of the return tuple that callers unpack.
    """
    bs, length = kpt_mask.shape
    tgt_size2 = num_group * group_bbox_kpt
    device = kpt_mask.device

    # Token -> group index; two tokens may interact only inside the same group.
    group_id = torch.arange(tgt_size2, device=device) // group_bbox_kpt
    cross_group = group_id[:, None] != group_id[None, :]
    # repeat() materialises an independent, writable copy per (batch, head).
    attn_mask2 = cross_group.repeat(bs, num_heads, 1, 1)

    # Inside a group, block pairs whose kpt_mask entries differ.
    differs = (kpt_mask[:, :, None] != kpt_mask[:, None, :]).unsqueeze(1)
    for idx in range(num_group):
        start_idx = idx * length
        end_idx = (idx + 1) * length
        # Broadcasts over the head axis.
        attn_mask2[:, :, start_idx:end_idx, start_idx:end_idx] = differs

    input_query_label = None
    input_query_bbox = None
    attn_mask = None
    dn_meta = None

    return input_query_label, input_query_bbox, attn_mask, attn_mask2.flatten(0, 1), dn_meta
38
+
39
+
40
def post_process(outputs_class, outputs_coord, dn_meta, aux_loss, _set_aux_loss):
    """Split the leading ``pad_size`` queries off every per-layer prediction.

    When ``dn_meta`` is truthy and ``dn_meta['pad_size'] > 0``, the first
    ``pad_size`` entries along dim 1 of each element are stored in
    ``dn_meta['output_known_lbs_bboxes']`` (last layer as ``pred_logits`` /
    ``pred_boxes``, plus ``aux_outputs`` from ``_set_aux_loss`` when
    ``aux_loss`` is truthy) and the remaining entries are returned.
    Otherwise both inputs are returned untouched.
    """
    if not (dn_meta and dn_meta['pad_size'] > 0):
        return outputs_class, outputs_coord

    pad = dn_meta['pad_size']

    known_class = [layer_out[:, :pad, :] for layer_out in outputs_class]
    known_coord = [layer_out[:, :pad, :] for layer_out in outputs_coord]

    remaining_class = [layer_out[:, pad:, :] for layer_out in outputs_class]
    remaining_coord = [layer_out[:, pad:, :] for layer_out in outputs_coord]

    known_out = {'pred_logits': known_class[-1], 'pred_boxes': known_coord[-1]}
    if aux_loss:
        known_out['aux_outputs'] = _set_aux_loss(known_class, known_coord)
    dn_meta['output_known_lbs_bboxes'] = known_out

    return remaining_class, remaining_coord
55
+
56
+
python/utils/dependencies/XPose/models/UniPose/ops/functions/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from .ms_deform_attn_func import MSDeformAttnFunction
10
+
python/utils/dependencies/XPose/models/UniPose/ops/functions/ms_deform_attn_func.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from torch.autograd import Function
16
+ from torch.autograd.function import once_differentiable
17
+
18
+ import MultiScaleDeformableAttention as MSDA
19
+
20
+
21
class MSDeformAttnFunction(Function):
    """Autograd wrapper around the compiled ``MultiScaleDeformableAttention`` ops."""

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        """Call the extension's forward op and stash its tensor inputs for backward."""
        ctx.im2col_step = im2col_step
        tensor_args = (
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
        )
        output = MSDA.ms_deform_attn_forward(*tensor_args, ctx.im2col_step)
        ctx.save_for_backward(*tensor_args)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Call the extension's backward op.

        Gradients are returned for ``value``, ``sampling_locations`` and
        ``attention_weights``; the other three forward inputs get ``None``.
        """
        saved = ctx.saved_tensors
        grads = MSDA.ms_deform_attn_backward(*saved, grad_output, ctx.im2col_step)
        grad_value, grad_sampling_loc, grad_attn_weight = grads

        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
39
+
40
+
41
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    """Pure-PyTorch multi-scale deformable attention (debug/test reference).

    Shapes, as unpacked below:
        value:              (N, S, M, D), with S the sum of H*W over all levels
        sampling_locations: (N, Lq, M, L, P, 2); mapped from [0, 1] to [-1, 1]
                            before being handed to ``F.grid_sample``
        attention_weights:  reshaped from (N, Lq, M, L, P)
    Returns a contiguous tensor of shape (N, Lq, M * D).
    """
    batch, _, heads, head_dim = value.shape
    _, num_queries, heads, num_levels, num_points, _ = sampling_locations.shape

    level_sizes = [height * width for height, width in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level, (height, width) in enumerate(value_spatial_shapes):
        # (N, H*W, M, D) -> (N, H*W, M*D) -> (N, M*D, H*W) -> (N*M, D, H, W)
        feature_map = per_level_values[level].flatten(2).transpose(1, 2)
        feature_map = feature_map.reshape(batch * heads, head_dim, height, width)
        # (N, Lq, M, P, 2) -> (N, M, Lq, P, 2) -> (N*M, Lq, P, 2)
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # (N*M, D, Lq, P)
        sampled_per_level.append(
            F.grid_sample(feature_map, level_grid,
                          mode='bilinear', padding_mode='zeros', align_corners=False)
        )

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, num_queries, num_levels * num_points)
    stacked = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    output = (stacked * weights).sum(-1).view(batch, heads * head_dim, num_queries)
    return output.transpose(1, 2).contiguous()
python/utils/dependencies/XPose/models/UniPose/ops/modules/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from .ms_deform_attn import MSDeformAttn
python/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import warnings
14
+ import math, os
15
+ import sys
16
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
17
+
18
+ import torch
19
+ from torch import nn
20
+ import torch.nn.functional as F
21
+ from torch.nn.init import xavier_uniform_, constant_
22
+
23
+ from src.utils.dependencies.XPose.models.UniPose.ops.functions.ms_deform_attn_func import MSDeformAttnFunction
24
+
25
+
26
+ def _is_power_of_2(n):
27
+ if (not isinstance(n, int)) or (n < 0):
28
+ raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
29
+ return (n & (n-1) == 0) and n != 0
30
+
31
+
32
class MSDeformAttn(nn.Module):
    """Multi-scale deformable attention layer backed by ``MSDeformAttnFunction``."""

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
        """
        Multi-Scale Deformable Attention Module
        :param d_model      hidden dimension
        :param n_levels     number of feature levels
        :param n_heads      number of attention heads
        :param n_points     number of sampling points per attention head per feature level
        :param use_4D_normalizer  when reference points are 4-D, scale the offsets by the
                                  per-level spatial shape instead of by ``n_points``
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        self.im2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self.use_4D_normalizer = use_4D_normalizer

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the projections.

        The sampling-offset weights are zeroed and the bias is set so that each
        head starts with its own direction (angles evenly spaced around the
        circle) and the i-th point is scaled by ``i + 1``. Attention-weight
        parameters are zeroed; value/output projections use Xavier-uniform
        weights and zero biases.
        """
        constant_(self.sampling_offsets.weight.data, 0.)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
        r"""
        :param query                       (N, Length_{query}, C)
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements

        :return output                     (N, Length_{query}, C)

        :raises ValueError                 if the last dim of reference_points is neither 2 nor 4
        """
        # NOTE: the docstring is a raw string because it contains LaTeX backslashes
        # (\sum, \cdot) that are invalid escape sequences in a normal string.
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            # Zero out padded positions so they contribute nothing when sampled.
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        # Softmax jointly over all levels and points of a head.
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2

        if reference_points.shape[-1] == 2:
            # (W, H) per level, to bring the offsets into the normalised coordinate range.
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            if self.use_4D_normalizer:
                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
                sampling_locations = reference_points[:, :, None, :, None, :2] \
                                     + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
            else:
                sampling_locations = reference_points[:, :, None, :, None, :2] \
                                     + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))

        # for amp
        if value.dtype == torch.float16:
            # for mixed precision: run the op in float32 and cast the result back.
            # attention_weights is cast as well so that every floating-point input
            # handed to the op has the same dtype (presumably required by the
            # compiled kernel; the cast is a no-op when it is already float32).
            output = MSDeformAttnFunction.apply(
                value.to(torch.float32), input_spatial_shapes, input_level_start_index,
                sampling_locations.to(torch.float32), attention_weights.to(torch.float32), self.im2col_step)
            output = output.to(torch.float16)
            output = self.output_proj(output)
            return output

        output = MSDeformAttnFunction.apply(
            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
        output = self.output_proj(output)
        return output
python/utils/dependencies/XPose/models/UniPose/ops/modules/ms_deform_attn_key_aware.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import warnings
14
+ import math, os
15
+
16
+ import torch
17
+ from torch import nn
18
+ import torch.nn.functional as F
19
+ from torch.nn.init import xavier_uniform_, constant_
20
+
21
+ try:
22
+ from src.utils.dependencies.XPose.models.UniPose.ops.functions import MSDeformAttnFunction
23
+ except:
24
+ warnings.warn('Failed to import MSDeformAttnFunction.')
25
+
26
+
27
+ def _is_power_of_2(n):
28
+ if (not isinstance(n, int)) or (n < 0):
29
+ raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
30
+ return (n & (n-1) == 0) and n != 0
31
+
32
+
33
class MSDeformAttn(nn.Module):
    """Multi-scale deformable attention layer whose ``forward`` also accepts a ``key``."""

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4, use_4D_normalizer=False):
        """
        Multi-Scale Deformable Attention Module
        :param d_model      hidden dimension
        :param n_levels     number of feature levels
        :param n_heads      number of attention heads
        :param n_points     number of sampling points per attention head per feature level
        :param use_4D_normalizer  when reference points are 4-D, scale the offsets by the
                                  per-level spatial shape instead of by ``n_points``
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        self.im2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self.use_4D_normalizer = use_4D_normalizer

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the projections.

        The sampling-offset weights are zeroed and the bias is set so that each
        head starts with its own direction (angles evenly spaced around the
        circle) and the i-th point is scaled by ``i + 1``. Attention-weight
        parameters are zeroed; value/output projections use Xavier-uniform
        weights and zero biases.
        """
        constant_(self.sampling_offsets.weight.data, 0.)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, query, key, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
        r"""
        :param query                       (N, Length_{query}, C)
        :param key                         (N, 1, C); accepted for interface compatibility, not read by this implementation
        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements

        :return output                     (N, Length_{query}, C)

        :raises ValueError                 if the last dim of reference_points is neither 2 nor 4
        """
        # NOTE: the docstring is a raw string because it contains LaTeX backslashes
        # (\sum, \cdot) that are invalid escape sequences in a normal string.
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            # Zero out padded positions so they contribute nothing when sampled.
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
        # Softmax jointly over all levels and points of a head.
        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2

        if reference_points.shape[-1] == 2:
            # (W, H) per level, to bring the offsets into the normalised coordinate range.
            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            if self.use_4D_normalizer:
                offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
                sampling_locations = reference_points[:, :, None, :, None, :2] \
                                     + sampling_offsets / offset_normalizer[None, None, None, :, None, :] * reference_points[:, :, None, :, None, 2:] * 0.5
            else:
                sampling_locations = reference_points[:, :, None, :, None, :2] \
                                     + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
        else:
            raise ValueError(
                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
        output = MSDeformAttnFunction.apply(
            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
        output = self.output_proj(output)
        return output
python/utils/dependencies/XPose/models/UniPose/ops/setup.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ import os
10
+ import glob
11
+
12
+ import torch
13
+
14
+ from torch.utils.cpp_extension import CUDA_HOME
15
+ from torch.utils.cpp_extension import CppExtension
16
+ from torch.utils.cpp_extension import CUDAExtension
17
+
18
+ from setuptools import find_packages
19
+ from setuptools import setup
20
+
21
+ requirements = ["torch", "torchvision"]
22
+
23
def get_extensions():
    """Return the one-element extension list for ``MultiScaleDeformableAttention``.

    Sources are globbed from the ``src`` directory next to this file
    (``*.cpp``, ``cpu/*.cpp`` and ``cuda/*.cu``). A CUDA build is mandatory:
    ``NotImplementedError`` is raised when ``torch.cuda.is_available()`` is
    false or ``CUDA_HOME`` is ``None``.
    """
    extensions_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")

    def _glob(*parts):
        return glob.glob(os.path.join(extensions_dir, *parts))

    sources = _glob("*.cpp") + _glob("cpu", "*.cpp")
    source_cuda = _glob("cuda", "*.cu")

    if not (torch.cuda.is_available() and CUDA_HOME is not None):
        raise NotImplementedError('Cuda is not availabel')

    sources += source_cuda
    nvcc_flags = [
        "-DCUDA_HAS_FP16=1",
        "-D__CUDA_NO_HALF_OPERATORS__",
        "-D__CUDA_NO_HALF_CONVERSIONS__",
        "-D__CUDA_NO_HALF2_OPERATORS__",
    ]

    sources = [os.path.join(extensions_dir, path) for path in sources]
    cuda_extension = CUDAExtension(
        "MultiScaleDeformableAttention",
        sources,
        include_dirs=[extensions_dir],
        define_macros=[("WITH_CUDA", None)],
        extra_compile_args={"cxx": [], "nvcc": nvcc_flags},
    )
    return [cuda_extension]
63
+
64
# Register the "MultiScaleDeformableAttention" package. The extension modules
# come from get_extensions() and are compiled through torch's BuildExtension
# command, which is installed as the "build_ext" step.
setup(
    name="MultiScaleDeformableAttention",
    version="1.0",
    author="Weijie Su",
    url="https://github.com/fundamentalvision/Deformable-DETR",
    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
    # Python packages are auto-discovered, skipping "configs" and "tests".
    packages=find_packages(exclude=("configs", "tests",)),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
python/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.cpp ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #include <vector>
12
+
13
+ #include <ATen/ATen.h>
14
+ #include <ATen/cuda/CUDAContext.h>
15
+
16
+
17
+ at::Tensor
18
+ ms_deform_attn_cpu_forward(
19
+ const at::Tensor &value,
20
+ const at::Tensor &spatial_shapes,
21
+ const at::Tensor &level_start_index,
22
+ const at::Tensor &sampling_loc,
23
+ const at::Tensor &attn_weight,
24
+ const int im2col_step)
25
+ {
26
+ AT_ERROR("Not implement on cpu");
27
+ }
28
+
29
+ std::vector<at::Tensor>
30
+ ms_deform_attn_cpu_backward(
31
+ const at::Tensor &value,
32
+ const at::Tensor &spatial_shapes,
33
+ const at::Tensor &level_start_index,
34
+ const at::Tensor &sampling_loc,
35
+ const at::Tensor &attn_weight,
36
+ const at::Tensor &grad_output,
37
+ const int im2col_step)
38
+ {
39
+ AT_ERROR("Not implement on cpu");
40
+ }
41
+
python/utils/dependencies/XPose/models/UniPose/ops/src/cpu/ms_deform_attn_cpu.h ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #pragma once
12
+ #include <torch/extension.h>
13
+
14
+ at::Tensor
15
+ ms_deform_attn_cpu_forward(
16
+ const at::Tensor &value,
17
+ const at::Tensor &spatial_shapes,
18
+ const at::Tensor &level_start_index,
19
+ const at::Tensor &sampling_loc,
20
+ const at::Tensor &attn_weight,
21
+ const int im2col_step);
22
+
23
+ std::vector<at::Tensor>
24
+ ms_deform_attn_cpu_backward(
25
+ const at::Tensor &value,
26
+ const at::Tensor &spatial_shapes,
27
+ const at::Tensor &level_start_index,
28
+ const at::Tensor &sampling_loc,
29
+ const at::Tensor &attn_weight,
30
+ const at::Tensor &grad_output,
31
+ const int im2col_step);
32
+
33
+
python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.cu ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #include <vector>
12
+ #include "cuda/ms_deform_im2col_cuda.cuh"
13
+
14
+ #include <ATen/ATen.h>
15
+ #include <ATen/cuda/CUDAContext.h>
16
+ #include <cuda.h>
17
+ #include <cuda_runtime.h>
18
+
19
+
20
/*
 * CUDA forward pass of multi-scale deformable attention.
 *
 * Sizes are read from the inputs as follows:
 *   value          -> (batch, spatial_size, num_heads, channels)
 *   spatial_shapes -> size(0) is num_levels (read as int64 data)
 *   sampling_loc   -> size(1) is num_query, size(4) is num_point
 * The batch is processed in chunks of min(batch, im2col_step) samples, with
 * one ms_deformable_im2col_cuda call per chunk.
 *
 * Returns the output viewed as (batch, num_query, num_heads * channels).
 * Throws c10::Error if an input is not contiguous or not a CUDA tensor, if
 * im2col_step is not positive, or if batch is not a multiple of the chunk
 * size. An empty batch yields an empty output without touching the kernel.
 */
at::Tensor ms_deform_attn_cuda_forward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const int im2col_step)
{
    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");

    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    TORCH_CHECK(im2col_step > 0, "im2col_step must be positive, got ", im2col_step);

    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());

    // With an empty batch the chunk size would be 0 and the divisibility
    // check below would divide by zero; there is nothing to compute anyway.
    if (batch == 0)
    {
        return output.view({batch, num_query, num_heads * channels});
    }

    const int im2col_step_ = std::min(batch, im2col_step);

    TORCH_CHECK(batch % im2col_step_ == 0,
                "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");

    const int batch_n = im2col_step_;
    const int num_chunks = batch / im2col_step_;
    auto output_n = output.view({num_chunks, batch_n, num_query, num_heads, channels});

    // Element counts per sample, kept in 64 bits so that the per-chunk
    // pointer offsets cannot overflow int on large inputs.
    const int64_t per_value_size = static_cast<int64_t>(spatial_size) * num_heads * channels;
    const int64_t per_sample_loc_size = static_cast<int64_t>(num_query) * num_heads * num_levels * num_point * 2;
    const int64_t per_attn_weight_size = static_cast<int64_t>(num_query) * num_heads * num_levels * num_point;

    for (int n = 0; n < num_chunks; ++n)
    {
        const int64_t first_sample = static_cast<int64_t>(n) * im2col_step_;
        auto columns = output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
                value.data_ptr<scalar_t>() + first_sample * per_value_size,
                spatial_shapes.data_ptr<int64_t>(),
                level_start_index.data_ptr<int64_t>(),
                sampling_loc.data_ptr<scalar_t>() + first_sample * per_sample_loc_size,
                attn_weight.data_ptr<scalar_t>() + first_sample * per_attn_weight_size,
                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                columns.data_ptr<scalar_t>());
        }));
    }

    output = output.view({batch, num_query, num_heads * channels});

    return output;
}
81
+
82
+
83
/*
 * CUDA backward pass of multi-scale deformable attention.
 *
 * Sizes are read from the inputs as follows:
 *   value          -> (batch, spatial_size, num_heads, channels)
 *   spatial_shapes -> size(0) is num_levels (read as int64 data)
 *   sampling_loc   -> size(1) is num_query, size(4) is num_point
 *   grad_output    -> viewed as (batch / step, step, num_query, num_heads, channels)
 * The batch is processed in chunks of min(batch, im2col_step) samples, with
 * one ms_deformable_col2im_cuda call per chunk.
 *
 * Returns {grad_value, grad_sampling_loc, grad_attn_weight}, each created
 * with zeros_like of the matching input.
 * Throws c10::Error if an input is not contiguous or not a CUDA tensor, if
 * im2col_step is not positive, or if batch is not a multiple of the chunk
 * size. An empty batch yields the (empty) zero gradients directly.
 */
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
    const at::Tensor &value,
    const at::Tensor &spatial_shapes,
    const at::Tensor &level_start_index,
    const at::Tensor &sampling_loc,
    const at::Tensor &attn_weight,
    const at::Tensor &grad_output,
    const int im2col_step)
{
    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
    TORCH_CHECK(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");

    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
    TORCH_CHECK(grad_output.is_cuda(), "grad_output must be a CUDA tensor");

    const int batch = value.size(0);
    const int spatial_size = value.size(1);
    const int num_heads = value.size(2);
    const int channels = value.size(3);

    const int num_levels = spatial_shapes.size(0);

    const int num_query = sampling_loc.size(1);
    const int num_point = sampling_loc.size(4);

    TORCH_CHECK(im2col_step > 0, "im2col_step must be positive, got ", im2col_step);

    auto grad_value = at::zeros_like(value);
    auto grad_sampling_loc = at::zeros_like(sampling_loc);
    auto grad_attn_weight = at::zeros_like(attn_weight);

    // With an empty batch the chunk size would be 0 and the divisibility
    // check below would divide by zero; there is nothing to compute anyway.
    if (batch == 0)
    {
        return {grad_value, grad_sampling_loc, grad_attn_weight};
    }

    const int im2col_step_ = std::min(batch, im2col_step);

    TORCH_CHECK(batch % im2col_step_ == 0,
                "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");

    const int batch_n = im2col_step_;
    const int num_chunks = batch / im2col_step_;

    // Element counts per sample, kept in 64 bits so that the per-chunk
    // pointer offsets cannot overflow int on large inputs.
    const int64_t per_value_size = static_cast<int64_t>(spatial_size) * num_heads * channels;
    const int64_t per_sample_loc_size = static_cast<int64_t>(num_query) * num_heads * num_levels * num_point * 2;
    const int64_t per_attn_weight_size = static_cast<int64_t>(num_query) * num_heads * num_levels * num_point;
    auto grad_output_n = grad_output.view({num_chunks, batch_n, num_query, num_heads, channels});

    for (int n = 0; n < num_chunks; ++n)
    {
        const int64_t first_sample = static_cast<int64_t>(n) * im2col_step_;
        auto grad_output_g = grad_output_n.select(0, n);
        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
                grad_output_g.data_ptr<scalar_t>(),
                value.data_ptr<scalar_t>() + first_sample * per_value_size,
                spatial_shapes.data_ptr<int64_t>(),
                level_start_index.data_ptr<int64_t>(),
                sampling_loc.data_ptr<scalar_t>() + first_sample * per_sample_loc_size,
                attn_weight.data_ptr<scalar_t>() + first_sample * per_attn_weight_size,
                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
                grad_value.data_ptr<scalar_t>() + first_sample * per_value_size,
                grad_sampling_loc.data_ptr<scalar_t>() + first_sample * per_sample_loc_size,
                grad_attn_weight.data_ptr<scalar_t>() + first_sample * per_attn_weight_size);
        }));
    }

    return {
        grad_value, grad_sampling_loc, grad_attn_weight
    };
}
python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_attn_cuda.h ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #pragma once
12
+ #include <torch/extension.h>
13
+
14
+ at::Tensor ms_deform_attn_cuda_forward(
15
+ const at::Tensor &value,
16
+ const at::Tensor &spatial_shapes,
17
+ const at::Tensor &level_start_index,
18
+ const at::Tensor &sampling_loc,
19
+ const at::Tensor &attn_weight,
20
+ const int im2col_step);
21
+
22
+ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
23
+ const at::Tensor &value,
24
+ const at::Tensor &spatial_shapes,
25
+ const at::Tensor &level_start_index,
26
+ const at::Tensor &sampling_loc,
27
+ const at::Tensor &attn_weight,
28
+ const at::Tensor &grad_output,
29
+ const int im2col_step);
30
+
python/utils/dependencies/XPose/models/UniPose/ops/src/cuda/ms_deform_im2col_cuda.cuh ADDED
@@ -0,0 +1,1327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************
7
+ * Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
8
+ * Copyright (c) 2018 Microsoft
9
+ **************************************************************************
10
+ */
11
+
12
+ #include <cstdio>
13
+ #include <algorithm>
14
+ #include <cstring>
15
+
16
+ #include <ATen/ATen.h>
17
+ #include <ATen/cuda/CUDAContext.h>
18
+
19
+ #include <THC/THCAtomics.cuh>
20
+
21
// Loop header for kernels: `i` starts at this thread's global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number
// of launched threads (blockDim.x * gridDim.x) while it stays below `n`.
#define CUDA_KERNEL_LOOP(i, n)                          \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
      i < (n);                                          \
      i += blockDim.x * gridDim.x)

// Thread-count constant shared by this header's launch code.
const int CUDA_NUM_THREADS = 1024;

// Number of blocks of `num_threads` threads needed to cover `N` items,
// i.e. N / num_threads rounded up.
inline int GET_BLOCKS(const int N, const int num_threads)
{
  const int padded = N + num_threads - 1;
  return padded / num_threads;
}
31
+
32
+
33
// Bilinearly interpolates one value from `bottom_data` at the fractional
// position (h, w) for head `m` and channel `c`. The data is indexed as
// ((row * width + col) * nheads + m) * channels + c. A corner contributes
// zero when its low index is negative or its high index exceeds the last
// row/column; callers are expected to keep (h, w) below (height, width).
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
                                                   const int &height, const int &width, const int &nheads, const int &channels,
                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
  // The four integer grid points surrounding (h, w).
  const int y0 = floor(h);
  const int x0 = floor(w);
  const int y1 = y0 + 1;
  const int x1 = x0 + 1;

  // Fractional distance from the low corner, and its complement.
  const scalar_t frac_h = h - y0;
  const scalar_t frac_w = w - x0;
  const scalar_t rem_h = 1 - frac_h, rem_w = 1 - frac_w;

  // Element offsets of the two rows / two columns plus the head/channel slot.
  const int col_stride = nheads * channels;
  const int row_stride = width * col_stride;
  const int row0_off = y0 * row_stride;
  const int row1_off = row0_off + row_stride;
  const int col0_off = x0 * col_stride;
  const int col1_off = col0_off + col_stride;
  const int slot_off = m * channels + c;

  const bool y0_ok = y0 >= 0;
  const bool x0_ok = x0 >= 0;
  const bool y1_ok = y1 <= height - 1;
  const bool x1_ok = x1 <= width - 1;

  scalar_t v1 = 0;
  if (y0_ok && x0_ok)
  {
    v1 = bottom_data[row0_off + col0_off + slot_off];
  }
  scalar_t v2 = 0;
  if (y0_ok && x1_ok)
  {
    v2 = bottom_data[row0_off + col1_off + slot_off];
  }
  scalar_t v3 = 0;
  if (y1_ok && x0_ok)
  {
    v3 = bottom_data[row1_off + col0_off + slot_off];
  }
  scalar_t v4 = 0;
  if (y1_ok && x1_ok)
  {
    v4 = bottom_data[row1_off + col1_off + slot_off];
  }

  // Interpolation weights of the four corners.
  const scalar_t w1 = rem_h * rem_w, w2 = rem_h * frac_w, w3 = frac_h * rem_w, w4 = frac_h * frac_w;

  return (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
}
85
+
86
+
87
// Backward counterpart of the bilinear sampler for one sample point.
// For each in-range corner it atomically adds (corner weight * top_grad *
// attn_weight) into `grad_value`, and it accumulates the derivative of the
// interpolated value with respect to h and w. The results are stored with
// plain writes: *grad_attn_weight receives top_grad * interpolated value,
// grad_sampling_loc[0] the w-gradient scaled by width and
// grad_sampling_loc[1] the h-gradient scaled by height.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
                                               const int &height, const int &width, const int &nheads, const int &channels,
                                               const scalar_t &h, const scalar_t &w, const int &m, const int &c,
                                               const scalar_t &top_grad,
                                               const scalar_t &attn_weight,
                                               scalar_t* &grad_value,
                                               scalar_t* grad_sampling_loc,
                                               scalar_t* grad_attn_weight)
{
  // The four integer grid points surrounding (h, w).
  const int y0 = floor(h);
  const int x0 = floor(w);
  const int y1 = y0 + 1;
  const int x1 = x0 + 1;

  // Fractional distance from the low corner, and its complement.
  const scalar_t frac_h = h - y0;
  const scalar_t frac_w = w - x0;
  const scalar_t rem_h = 1 - frac_h, rem_w = 1 - frac_w;

  // Element offsets of the two rows / two columns plus the head/channel slot.
  const int col_stride = nheads * channels;
  const int row_stride = width * col_stride;
  const int row0_off = y0 * row_stride;
  const int row1_off = row0_off + row_stride;
  const int col0_off = x0 * col_stride;
  const int col1_off = col0_off + col_stride;
  const int slot_off = m * channels + c;

  const scalar_t w1 = rem_h * rem_w, w2 = rem_h * frac_w, w3 = frac_h * rem_w, w4 = frac_h * frac_w;
  const scalar_t scaled_grad = top_grad * attn_weight;
  scalar_t d_h = 0, d_w = 0;

  scalar_t v1 = 0;
  if (y0 >= 0 && x0 >= 0)
  {
    const int idx = row0_off + col0_off + slot_off;
    v1 = bottom_data[idx];
    d_h -= rem_w * v1;
    d_w -= rem_h * v1;
    atomicAdd(grad_value + idx, w1 * scaled_grad);
  }
  scalar_t v2 = 0;
  if (y0 >= 0 && x1 <= width - 1)
  {
    const int idx = row0_off + col1_off + slot_off;
    v2 = bottom_data[idx];
    d_h -= frac_w * v2;
    d_w += rem_h * v2;
    atomicAdd(grad_value + idx, w2 * scaled_grad);
  }
  scalar_t v3 = 0;
  if (y1 <= height - 1 && x0 >= 0)
  {
    const int idx = row1_off + col0_off + slot_off;
    v3 = bottom_data[idx];
    d_h += rem_w * v3;
    d_w -= frac_h * v3;
    atomicAdd(grad_value + idx, w3 * scaled_grad);
  }
  scalar_t v4 = 0;
  if (y1 <= height - 1 && x1 <= width - 1)
  {
    const int idx = row1_off + col1_off + slot_off;
    v4 = bottom_data[idx];
    d_h += frac_w * v4;
    d_w += frac_h * v4;
    atomicAdd(grad_value + idx, w4 * scaled_grad);
  }

  const scalar_t sampled = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  *grad_attn_weight = top_grad * sampled;
  grad_sampling_loc[0] = width * d_w * scaled_grad;
  grad_sampling_loc[1] = height * d_h * scaled_grad;
}
160
+
161
+
162
// Same gradient computation as ms_deform_attn_col2im_bilinear, except that
// the three results are accumulated with atomicAdd instead of being stored:
// *grad_attn_weight += top_grad * interpolated value,
// grad_sampling_loc[0] += width-scaled w-gradient and
// grad_sampling_loc[1] += height-scaled h-gradient.
// Each in-range corner also atomically receives its share in `grad_value`.
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
                                                  const int &height, const int &width, const int &nheads, const int &channels,
                                                  const scalar_t &h, const scalar_t &w, const int &m, const int &c,
                                                  const scalar_t &top_grad,
                                                  const scalar_t &attn_weight,
                                                  scalar_t* &grad_value,
                                                  scalar_t* grad_sampling_loc,
                                                  scalar_t* grad_attn_weight)
{
  // The four integer grid points surrounding (h, w).
  const int y0 = floor(h);
  const int x0 = floor(w);
  const int y1 = y0 + 1;
  const int x1 = x0 + 1;

  // Fractional distance from the low corner, and its complement.
  const scalar_t frac_h = h - y0;
  const scalar_t frac_w = w - x0;
  const scalar_t rem_h = 1 - frac_h, rem_w = 1 - frac_w;

  // Element offsets of the two rows / two columns plus the head/channel slot.
  const int col_stride = nheads * channels;
  const int row_stride = width * col_stride;
  const int row0_off = y0 * row_stride;
  const int row1_off = row0_off + row_stride;
  const int col0_off = x0 * col_stride;
  const int col1_off = col0_off + col_stride;
  const int slot_off = m * channels + c;

  const scalar_t w1 = rem_h * rem_w, w2 = rem_h * frac_w, w3 = frac_h * rem_w, w4 = frac_h * frac_w;
  const scalar_t scaled_grad = top_grad * attn_weight;
  scalar_t d_h = 0, d_w = 0;

  scalar_t v1 = 0;
  if (y0 >= 0 && x0 >= 0)
  {
    const int idx = row0_off + col0_off + slot_off;
    v1 = bottom_data[idx];
    d_h -= rem_w * v1;
    d_w -= rem_h * v1;
    atomicAdd(grad_value + idx, w1 * scaled_grad);
  }
  scalar_t v2 = 0;
  if (y0 >= 0 && x1 <= width - 1)
  {
    const int idx = row0_off + col1_off + slot_off;
    v2 = bottom_data[idx];
    d_h -= frac_w * v2;
    d_w += rem_h * v2;
    atomicAdd(grad_value + idx, w2 * scaled_grad);
  }
  scalar_t v3 = 0;
  if (y1 <= height - 1 && x0 >= 0)
  {
    const int idx = row1_off + col0_off + slot_off;
    v3 = bottom_data[idx];
    d_h += rem_w * v3;
    d_w -= frac_h * v3;
    atomicAdd(grad_value + idx, w3 * scaled_grad);
  }
  scalar_t v4 = 0;
  if (y1 <= height - 1 && x1 <= width - 1)
  {
    const int idx = row1_off + col1_off + slot_off;
    v4 = bottom_data[idx];
    d_h += frac_w * v4;
    d_w += frac_h * v4;
    atomicAdd(grad_value + idx, w4 * scaled_grad);
  }

  const scalar_t sampled = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  atomicAdd(grad_attn_weight, top_grad * sampled);
  atomicAdd(grad_sampling_loc, width * d_w * scaled_grad);
  atomicAdd(grad_sampling_loc + 1, height * d_h * scaled_grad);
}
235
+
236
+
237
// Forward kernel: each loop index produces one element of `data_col`.
// The index is decomposed with channel fastest, then head, then query, then
// batch. For that element the kernel walks every level and every sampling
// point, bilinearly samples `data_value` at the point's location and sums
// the samples weighted by the matching attention weight.
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    const int c_col = index % channels;
    // Index with the channel stripped: identifies (batch, query, head).
    const int sampling_index = index / channels;
    const int m_col = sampling_index % num_heads;
    const int b_col = (sampling_index / num_heads) / num_query;

    // Running cursors into the attention weights and the (w, h) location pairs.
    int weight_cursor = sampling_index * num_levels * num_point;
    int loc_cursor = weight_cursor << 1;
    const int qid_stride = num_heads * channels;
    const int batch_value_offset = b_col * spatial_size * qid_stride;
    scalar_t acc = 0;

    for (int l_col = 0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      // spatial shapes are stored as consecutive (h, w) pairs per level
      const int shape_base = l_col << 1;
      const int spatial_h = data_spatial_shapes[shape_base];
      const int spatial_w = data_spatial_shapes[shape_base + 1];
      const scalar_t *data_value_ptr = data_value + (batch_value_offset + level_start_id * qid_stride);

      for (int p_col = 0; p_col < num_point; ++p_col, weight_cursor += 1, loc_cursor += 2)
      {
        const scalar_t loc_w = data_sampling_loc[loc_cursor];
        const scalar_t loc_h = data_sampling_loc[loc_cursor + 1];
        const scalar_t weight = data_attn_weight[weight_cursor];

        // Scale the location by the level's size and shift by half a cell.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        // Points more than one cell outside the map contribute nothing.
        const bool in_range = h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w;
        if (in_range)
        {
          acc += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
        }
      }
    }
    data_col[index] = acc;
  }
}
300
+
301
// Backward kernel, shared-memory variant with a serial reduction.
// Every thread handles one loop index (channel fastest, then head, query,
// batch). For each level and sampling point, each thread writes its partial
// sampling-location and attention-weight gradients into its own shared
// memory slot; after a barrier, thread 0 adds up all `blockSize` slots in
// order and stores the totals to the global gradient buffers.
// NOTE(review): grad_sampling_loc / grad_attn_weight (the kernel parameters)
// are advanced in place, so this appears to rely on each thread running the
// outer loop body at most once — confirm against the launch configuration.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // One (w, h) gradient pair and one weight gradient per thread.
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;

    const int c_col = index % channels;
    // Index with the channel stripped: identifies (batch, query, head).
    const int sampling_index = index / channels;
    const int m_col = sampling_index % num_heads;
    const int b_col = (sampling_index / num_heads) / num_query;

    const scalar_t top_grad = grad_col[index];

    // Running cursors into the attention weights and the (w, h) location pairs.
    int weight_cursor = sampling_index * num_levels * num_point;
    int loc_cursor = weight_cursor << 1;
    const int grad_base = weight_cursor;
    grad_sampling_loc += grad_base << 1;
    grad_attn_weight += grad_base;
    const int qid_stride = num_heads * channels;
    const int batch_value_offset = b_col * spatial_size * qid_stride;

    // This thread's private slots in the shared caches.
    scalar_t *my_loc_slot = cache_grad_sampling_loc + (tid << 1);
    scalar_t *my_weight_slot = cache_grad_attn_weight + tid;

    for (int l_col = 0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      // spatial shapes are stored as consecutive (h, w) pairs per level
      const int shape_base = l_col << 1;
      const int spatial_h = data_spatial_shapes[shape_base];
      const int spatial_w = data_spatial_shapes[shape_base + 1];
      const int value_ptr_offset = batch_value_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[loc_cursor];
        const scalar_t loc_h = data_sampling_loc[loc_cursor + 1];
        const scalar_t weight = data_attn_weight[weight_cursor];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        // Clear the slots first so out-of-range points contribute zero.
        my_loc_slot[0] = 0;
        my_loc_slot[1] = 0;
        *my_weight_slot = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            my_loc_slot, my_weight_slot);
        }

        __syncthreads();
        if (tid == 0)
        {
          // Serial sum over all slots, in slot order.
          scalar_t sum_w = cache_grad_sampling_loc[0];
          scalar_t sum_h = cache_grad_sampling_loc[1];
          scalar_t sum_a = cache_grad_attn_weight[0];
          for (unsigned int slot = 1; slot < blockSize; ++slot)
          {
            sum_w += cache_grad_sampling_loc[2 * slot];
            sum_h += cache_grad_sampling_loc[2 * slot + 1];
            sum_a += cache_grad_attn_weight[slot];
          }

          grad_sampling_loc[0] = sum_w;
          grad_sampling_loc[1] = sum_h;
          *grad_attn_weight = sum_a;
        }
        // Keep the caches intact until thread 0 has finished reading them.
        __syncthreads();

        weight_cursor += 1;
        loc_cursor += 2;
        grad_attn_weight += 1;
        grad_sampling_loc += 2;
      }
    }
  }
}
404
+
405
+
406
// Backward kernel, shared-memory variant with a halving reduction.
// Every thread handles one loop index (channel fastest, then head, query,
// batch). For each level and sampling point, each thread writes its partial
// sampling-location and attention-weight gradients into its own shared
// memory slot; the slots are then folded pairwise (stride blockSize/2,
// halved each step) and thread 0 stores slot 0 to the global buffers.
// NOTE(review): the halving loop only covers every slot when blockSize is a
// power of two — presumably guaranteed by the launcher; confirm.
// NOTE(review): grad_sampling_loc / grad_attn_weight (the kernel parameters)
// are advanced in place, so this appears to rely on each thread running the
// outer loop body at most once — confirm against the launch configuration.
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
                                                const scalar_t *grad_col,
                                                const scalar_t *data_value,
                                                const int64_t *data_spatial_shapes,
                                                const int64_t *data_level_start_index,
                                                const scalar_t *data_sampling_loc,
                                                const scalar_t *data_attn_weight,
                                                const int batch_size,
                                                const int spatial_size,
                                                const int num_heads,
                                                const int channels,
                                                const int num_levels,
                                                const int num_query,
                                                const int num_point,
                                                scalar_t *grad_value,
                                                scalar_t *grad_sampling_loc,
                                                scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // One (w, h) gradient pair and one weight gradient per thread.
    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
    __shared__ scalar_t cache_grad_attn_weight[blockSize];
    unsigned int tid = threadIdx.x;

    const int c_col = index % channels;
    // Index with the channel stripped: identifies (batch, query, head).
    const int sampling_index = index / channels;
    const int m_col = sampling_index % num_heads;
    const int b_col = (sampling_index / num_heads) / num_query;

    const scalar_t top_grad = grad_col[index];

    // Running cursors into the attention weights and the (w, h) location pairs.
    int weight_cursor = sampling_index * num_levels * num_point;
    int loc_cursor = weight_cursor << 1;
    const int grad_base = weight_cursor;
    grad_sampling_loc += grad_base << 1;
    grad_attn_weight += grad_base;
    const int qid_stride = num_heads * channels;
    const int batch_value_offset = b_col * spatial_size * qid_stride;

    // This thread's private slots in the shared caches.
    scalar_t *my_loc_slot = cache_grad_sampling_loc + (tid << 1);
    scalar_t *my_weight_slot = cache_grad_attn_weight + tid;

    for (int l_col = 0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      // spatial shapes are stored as consecutive (h, w) pairs per level
      const int shape_base = l_col << 1;
      const int spatial_h = data_spatial_shapes[shape_base];
      const int spatial_w = data_spatial_shapes[shape_base + 1];
      const int value_ptr_offset = batch_value_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col = 0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[loc_cursor];
        const scalar_t loc_h = data_sampling_loc[loc_cursor + 1];
        const scalar_t weight = data_attn_weight[weight_cursor];

        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        // Clear the slots first so out-of-range points contribute zero.
        my_loc_slot[0] = 0;
        my_loc_slot[1] = 0;
        *my_weight_slot = 0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            my_loc_slot, my_weight_slot);
        }

        __syncthreads();

        // Fold the upper half of the active slots onto the lower half.
        for (unsigned int half = blockSize / 2; half > 0; half >>= 1)
        {
          if (tid < half)
          {
            const unsigned int dst = tid << 1;
            const unsigned int src = (tid + half) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + half];
            cache_grad_sampling_loc[dst] += cache_grad_sampling_loc[src];
            cache_grad_sampling_loc[dst + 1] += cache_grad_sampling_loc[src + 1];
          }
          __syncthreads();
        }

        if (tid == 0)
        {
          grad_sampling_loc[0] = cache_grad_sampling_loc[0];
          grad_sampling_loc[1] = cache_grad_sampling_loc[1];
          *grad_attn_weight = cache_grad_attn_weight[0];
        }
        // Keep the caches intact until thread 0 has finished reading them.
        __syncthreads();

        weight_cursor += 1;
        loc_cursor += 2;
        grad_attn_weight += 1;
        grad_sampling_loc += 2;
      }
    }
  }
}
511
+
512
+
513
/*
 * Backward (col2im) kernel for multi-scale deformable attention that combines the
 * per-thread gradients of one sampling point through dynamically sized shared
 * memory, with thread 0 of the block summing all entries serially.
 *
 * Work decomposition: `index` in [0, n) is split, fastest-varying first, into
 *   c_col (channel), m_col (head), query, b_col (batch).
 * `sampling_index = index / channels` identifies the (batch, query, head) triple
 * and selects the slice of data_sampling_loc / data_attn_weight and of the two
 * gradient outputs that this thread works on.
 *
 * Shared memory layout (bytes supplied at launch): 2 * blockDim.x scalar_t slots
 * for the sampling-location gradients (w, h interleaved per thread) followed by
 * blockDim.x slots for the attention-weight gradients.
 *
 * Parameters:
 *   n                      number of work items iterated by CUDA_KERNEL_LOOP
 *   grad_col               upstream gradient, read at [index]
 *   data_value             value tensor; offset by batch and level before being
 *                          handed to ms_deform_attn_col2im_bilinear
 *   data_spatial_shapes    (h, w) pair per level
 *   data_level_start_index start position of each level inside the value tensor
 *   data_sampling_loc      (w, h) pair per (sampling_index, level, point)
 *   data_attn_weight       one weight per (sampling_index, level, point)
 *   batch_size             not read by this kernel
 *   spatial_size, num_heads, channels, num_levels, num_query, num_point
 *                          extents used for the index arithmetic above
 *   grad_value             gradient w.r.t. data_value (written by the helper)
 *   grad_sampling_loc      gradient w.r.t. data_sampling_loc (written by thread 0)
 *   grad_attn_weight       gradient w.r.t. data_attn_weight (written by thread 0)
 *
 * NOTE(review): thread 0 overwrites (does not accumulate into) the output slot with
 * the block-wide sum, so this presumably expects exactly one block per
 * sampling_index -- confirm against the launcher.
 */
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
                                                              const scalar_t *grad_col,
                                                              const scalar_t *data_value,
                                                              const int64_t *data_spatial_shapes,
                                                              const int64_t *data_level_start_index,
                                                              const scalar_t *data_sampling_loc,
                                                              const scalar_t *data_attn_weight,
                                                              const int batch_size,
                                                              const int spatial_size,
                                                              const int num_heads,
                                                              const int channels,
                                                              const int num_levels,
                                                              const int num_query,
                                                              const int num_point,
                                                              scalar_t *grad_value,
                                                              scalar_t *grad_sampling_loc,
                                                              scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    extern __shared__ int _s[];
    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    const unsigned int tid = threadIdx.x;

    // Peel the flat index apart: channel, then head, then query, then batch.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;  // the query coordinate itself is not needed
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;

    // Output cursors are locals derived from the untouched kernel parameters.
    // Advancing the parameters themselves would let offsets pile up if
    // CUDA_KERNEL_LOOP (defined outside this view) runs its body more than once
    // in the same thread.
    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by half a pixel.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        // Clear this thread's slots so threads that skip the sample contribute 0.
        *(cache_grad_sampling_loc+(tid << 1)) = 0;
        *(cache_grad_sampling_loc+((tid << 1) + 1)) = 0;
        *(cache_grad_attn_weight+tid)=0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
        }

        // Every thread's slots must be written before thread 0 reads them.
        __syncthreads();
        if (tid == 0)
        {
          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
          int sid=2;
          for (unsigned int other = 1; other < blockDim.x; ++other)
          {
            _grad_w += cache_grad_sampling_loc[sid];
            _grad_h += cache_grad_sampling_loc[sid + 1];
            _grad_a += cache_grad_attn_weight[other];
            sid += 2;
          }

          *grad_loc_out = _grad_w;
          *(grad_loc_out + 1) = _grad_h;
          *grad_weight_out = _grad_a;
        }
        // Keep the cache intact until thread 0 has finished summing.
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_weight_out += grad_weight_stride;
        grad_loc_out += grad_loc_stride;
      }
    }
  }
}
617
+
618
/*
 * Backward (col2im) kernel for multi-scale deformable attention that combines the
 * per-thread gradients of one sampling point with a pairwise (tree) reduction in
 * dynamically sized shared memory. The reduction also handles block sizes that are
 * not a power of two: whenever the range being halved has an odd length, the
 * leftover last element is folded in by the `tid + 2*s < spre` branch.
 *
 * Work decomposition: `index` in [0, n) is split, fastest-varying first, into
 *   c_col (channel), m_col (head), query, b_col (batch).
 * `sampling_index = index / channels` identifies the (batch, query, head) triple
 * and selects the slice of data_sampling_loc / data_attn_weight and of the two
 * gradient outputs that this thread works on.
 *
 * Shared memory layout (bytes supplied at launch): 2 * blockDim.x scalar_t slots
 * for the sampling-location gradients (w, h interleaved per thread) followed by
 * blockDim.x slots for the attention-weight gradients.
 *
 * Parameters:
 *   n                      number of work items iterated by CUDA_KERNEL_LOOP
 *   grad_col               upstream gradient, read at [index]
 *   data_value             value tensor; offset by batch and level before being
 *                          handed to ms_deform_attn_col2im_bilinear
 *   data_spatial_shapes    (h, w) pair per level
 *   data_level_start_index start position of each level inside the value tensor
 *   data_sampling_loc      (w, h) pair per (sampling_index, level, point)
 *   data_attn_weight       one weight per (sampling_index, level, point)
 *   batch_size             not read by this kernel
 *   spatial_size, num_heads, channels, num_levels, num_query, num_point
 *                          extents used for the index arithmetic above
 *   grad_value             gradient w.r.t. data_value (written by the helper)
 *   grad_sampling_loc      gradient w.r.t. data_sampling_loc (written by thread 0)
 *   grad_attn_weight       gradient w.r.t. data_attn_weight (written by thread 0)
 *
 * NOTE(review): thread 0 overwrites (does not accumulate into) the output slot with
 * the block-wide sum, so this presumably expects exactly one block per
 * sampling_index -- confirm against the launcher.
 */
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
                                                              const scalar_t *grad_col,
                                                              const scalar_t *data_value,
                                                              const int64_t *data_spatial_shapes,
                                                              const int64_t *data_level_start_index,
                                                              const scalar_t *data_sampling_loc,
                                                              const scalar_t *data_attn_weight,
                                                              const int batch_size,
                                                              const int spatial_size,
                                                              const int num_heads,
                                                              const int channels,
                                                              const int num_levels,
                                                              const int num_query,
                                                              const int num_point,
                                                              scalar_t *grad_value,
                                                              scalar_t *grad_sampling_loc,
                                                              scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    extern __shared__ int _s[];
    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    const unsigned int tid = threadIdx.x;

    // Peel the flat index apart: channel, then head, then query, then batch.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;  // the query coordinate itself is not needed
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;

    // Output cursors are locals derived from the untouched kernel parameters.
    // Advancing the parameters themselves would let offsets pile up if
    // CUDA_KERNEL_LOOP (defined outside this view) runs its body more than once
    // in the same thread.
    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by half a pixel.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        // Clear this thread's slots so threads that skip the sample contribute 0.
        *(cache_grad_sampling_loc+(tid << 1)) = 0;
        *(cache_grad_sampling_loc+((tid << 1) + 1)) = 0;
        *(cache_grad_attn_weight+tid)=0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
        }

        // Every thread's slots must be written before the reduction reads them.
        __syncthreads();

        // Halve the active range each round; `spre` is the length of the range
        // being reduced and `s` the number of threads that stay active.
        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
        {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
            // Odd-length range: pick up the element left without a partner.
            if (tid + (s << 1) < spre)
            {
              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          __syncthreads();
        }

        if (tid == 0)
        {
          *grad_loc_out = cache_grad_sampling_loc[0];
          *(grad_loc_out + 1) = cache_grad_sampling_loc[1];
          *grad_weight_out = cache_grad_attn_weight[0];
        }
        // Keep the cache intact until thread 0 has stored the result.
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_weight_out += grad_weight_stride;
        grad_loc_out += grad_loc_stride;
      }
    }
  }
}
730
+
731
/*
 * Backward (col2im) kernel for multi-scale deformable attention that combines the
 * per-thread gradients of one sampling point with a pairwise (tree) reduction in
 * dynamically sized shared memory and then adds the block's partial sum to the
 * output with atomicAdd. ms_deformable_col2im_cuda selects it when `channels`
 * exceeds 1024 and is a multiple of 1024; presumably several blocks then share one
 * output slot, which is why the result is accumulated rather than stored.
 *
 * Work decomposition: `index` in [0, n) is split, fastest-varying first, into
 *   c_col (channel), m_col (head), query, b_col (batch).
 * `sampling_index = index / channels` identifies the (batch, query, head) triple
 * and selects the slice of data_sampling_loc / data_attn_weight and of the two
 * gradient outputs that this thread works on.
 *
 * Shared memory layout (bytes supplied at launch): 2 * blockDim.x scalar_t slots
 * for the sampling-location gradients (w, h interleaved per thread) followed by
 * blockDim.x slots for the attention-weight gradients.
 *
 * Parameters:
 *   n                      number of work items iterated by CUDA_KERNEL_LOOP
 *   grad_col               upstream gradient, read at [index]
 *   data_value             value tensor; offset by batch and level before being
 *                          handed to ms_deform_attn_col2im_bilinear
 *   data_spatial_shapes    (h, w) pair per level
 *   data_level_start_index start position of each level inside the value tensor
 *   data_sampling_loc      (w, h) pair per (sampling_index, level, point)
 *   data_attn_weight       one weight per (sampling_index, level, point)
 *   batch_size             not read by this kernel
 *   spatial_size, num_heads, channels, num_levels, num_query, num_point
 *                          extents used for the index arithmetic above
 *   grad_value             gradient w.r.t. data_value (written by the helper)
 *   grad_sampling_loc      gradient w.r.t. data_sampling_loc (atomically
 *                          accumulated by thread 0)
 *   grad_attn_weight       gradient w.r.t. data_attn_weight (atomically
 *                          accumulated by thread 0)
 *
 * NOTE(review): because results are accumulated, the two gradient outputs
 * presumably have to be zero-initialised by the caller -- confirm.
 */
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
                                                                           const scalar_t *grad_col,
                                                                           const scalar_t *data_value,
                                                                           const int64_t *data_spatial_shapes,
                                                                           const int64_t *data_level_start_index,
                                                                           const scalar_t *data_sampling_loc,
                                                                           const scalar_t *data_attn_weight,
                                                                           const int batch_size,
                                                                           const int spatial_size,
                                                                           const int num_heads,
                                                                           const int channels,
                                                                           const int num_levels,
                                                                           const int num_query,
                                                                           const int num_point,
                                                                           scalar_t *grad_value,
                                                                           scalar_t *grad_sampling_loc,
                                                                           scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    extern __shared__ int _s[];
    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
    const unsigned int tid = threadIdx.x;

    // Peel the flat index apart: channel, then head, then query, then batch.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;  // the query coordinate itself is not needed
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;

    // Output cursors are locals derived from the untouched kernel parameters.
    // Advancing the parameters themselves would let offsets pile up if
    // CUDA_KERNEL_LOOP (defined outside this view) runs its body more than once
    // in the same thread.
    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by half a pixel.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;

        // Clear this thread's slots so threads that skip the sample contribute 0.
        *(cache_grad_sampling_loc+(tid << 1)) = 0;
        *(cache_grad_sampling_loc+((tid << 1) + 1)) = 0;
        *(cache_grad_attn_weight+tid)=0;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            cache_grad_sampling_loc+(tid << 1), cache_grad_attn_weight+tid);
        }

        // Every thread's slots must be written before the reduction reads them.
        __syncthreads();

        // Halve the active range each round; `spre` is the length of the range
        // being reduced and `s` the number of threads that stay active.
        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
        {
          if (tid < s) {
            const unsigned int xid1 = tid << 1;
            const unsigned int xid2 = (tid + s) << 1;
            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
            // Odd-length range: pick up the element left without a partner.
            if (tid + (s << 1) < spre)
            {
              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
            }
          }
          __syncthreads();
        }

        if (tid == 0)
        {
          // Accumulate instead of store: other blocks may target the same slot.
          atomicAdd(grad_loc_out, cache_grad_sampling_loc[0]);
          atomicAdd(grad_loc_out + 1, cache_grad_sampling_loc[1]);
          atomicAdd(grad_weight_out, cache_grad_attn_weight[0]);
        }
        // Keep the cache intact until thread 0 has published the result.
        __syncthreads();

        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_weight_out += grad_weight_stride;
        grad_loc_out += grad_loc_stride;
      }
    }
  }
}
843
+
844
+
845
/*
 * Backward (col2im) kernel for multi-scale deformable attention that uses no
 * shared memory: each thread hands the global gradient pointers for its sampling
 * point straight to ms_deform_attn_col2im_bilinear_gm (defined elsewhere;
 * presumably it accumulates into them atomically -- confirm).
 * ms_deformable_col2im_cuda selects this kernel when `channels` exceeds 1024 and
 * is not a multiple of 1024.
 *
 * Work decomposition: `index` in [0, n) is split, fastest-varying first, into
 *   c_col (channel), m_col (head), query, b_col (batch).
 * `sampling_index = index / channels` identifies the (batch, query, head) triple
 * and selects the slice of data_sampling_loc / data_attn_weight and of the two
 * gradient outputs that this thread works on.
 *
 * Parameters:
 *   n                      number of work items iterated by CUDA_KERNEL_LOOP
 *   grad_col               upstream gradient, read at [index]
 *   data_value             value tensor; offset by batch and level before being
 *                          handed to the bilinear helper
 *   data_spatial_shapes    (h, w) pair per level
 *   data_level_start_index start position of each level inside the value tensor
 *   data_sampling_loc      (w, h) pair per (sampling_index, level, point)
 *   data_attn_weight       one weight per (sampling_index, level, point)
 *   batch_size             not read by this kernel
 *   spatial_size, num_heads, channels, num_levels, num_query, num_point
 *                          extents used for the index arithmetic above
 *   grad_value, grad_sampling_loc, grad_attn_weight
 *                          gradient outputs, written by the bilinear helper
 */
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
                                                   const scalar_t *grad_col,
                                                   const scalar_t *data_value,
                                                   const int64_t *data_spatial_shapes,
                                                   const int64_t *data_level_start_index,
                                                   const scalar_t *data_sampling_loc,
                                                   const scalar_t *data_attn_weight,
                                                   const int batch_size,
                                                   const int spatial_size,
                                                   const int num_heads,
                                                   const int channels,
                                                   const int num_levels,
                                                   const int num_query,
                                                   const int num_point,
                                                   scalar_t *grad_value,
                                                   scalar_t *grad_sampling_loc,
                                                   scalar_t *grad_attn_weight)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // Peel the flat index apart: channel, then head, then query, then batch.
    int _temp = index;
    const int c_col = _temp % channels;
    _temp /= channels;
    const int sampling_index = _temp;
    const int m_col = _temp % num_heads;
    _temp /= num_heads;
    _temp /= num_query;  // the query coordinate itself is not needed
    const int b_col = _temp;

    const scalar_t top_grad = grad_col[index];

    int data_weight_ptr = sampling_index * num_levels * num_point;
    int data_loc_w_ptr = data_weight_ptr << 1;

    // Output cursors are locals derived from the untouched kernel parameters.
    // Advancing the parameters themselves would let offsets pile up if
    // CUDA_KERNEL_LOOP (defined outside this view) runs its body more than once
    // in the same thread.
    scalar_t *grad_loc_out = grad_sampling_loc + (data_weight_ptr << 1);
    scalar_t *grad_weight_out = grad_attn_weight + data_weight_ptr;
    const int grad_weight_stride = 1;
    const int grad_loc_stride = 2;
    const int qid_stride = num_heads * channels;
    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;

    for (int l_col=0; l_col < num_levels; ++l_col)
    {
      const int level_start_id = data_level_start_index[l_col];
      const int spatial_h_ptr = l_col << 1;
      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;

      for (int p_col=0; p_col < num_point; ++p_col)
      {
        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
        const scalar_t weight = data_attn_weight[data_weight_ptr];

        // Scale the stored location by the level size and shift by half a pixel.
        const scalar_t h_im = loc_h * spatial_h - 0.5;
        const scalar_t w_im = loc_w * spatial_w - 0.5;
        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
        {
          ms_deform_attn_col2im_bilinear_gm(
            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
            top_grad, weight, grad_value_ptr,
            grad_loc_out, grad_weight_out);
        }
        data_weight_ptr += 1;
        data_loc_w_ptr += 2;
        grad_weight_out += grad_weight_stride;
        grad_loc_out += grad_loc_stride;
      }
    }
  }
}
921
+
922
+
923
/*
 * Host-side launcher for the forward (im2col) kernel of multi-scale deformable
 * attention. It schedules one work item per output element
 * (batch_size * num_query * num_heads * channels) on `stream`, using
 * CUDA_NUM_THREADS threads per block, and prints a message if the launch
 * reported an error. No error is returned to the caller.
 *
 * All tensor pointers and extents are forwarded unchanged to
 * ms_deformable_im2col_gpu_kernel; `data_col` receives the kernel's output.
 */
template <typename scalar_t>
void ms_deformable_im2col_cuda(cudaStream_t stream,
                               const scalar_t* data_value,
                               const int64_t* data_spatial_shapes,
                               const int64_t* data_level_start_index,
                               const scalar_t* data_sampling_loc,
                               const scalar_t* data_attn_weight,
                               const int batch_size,
                               const int spatial_size,
                               const int num_heads,
                               const int channels,
                               const int num_levels,
                               const int num_query,
                               const int num_point,
                               scalar_t* data_col)
{
  // The same count sizes the grid and bounds the kernel's index loop.
  const int total_work_items = batch_size * num_query * num_heads * channels;
  const int threads_per_block = CUDA_NUM_THREADS;

  ms_deformable_im2col_gpu_kernel<scalar_t>
      <<<GET_BLOCKS(total_work_items, threads_per_block), threads_per_block, 0, stream>>>(
          total_work_items,
          data_value,
          data_spatial_shapes,
          data_level_start_index,
          data_sampling_loc,
          data_attn_weight,
          batch_size,
          spatial_size,
          num_heads,
          channels,
          num_levels,
          num_query,
          num_point,
          data_col);

  const cudaError_t launch_status = cudaGetLastError();
  if (launch_status == cudaSuccess)
  {
    return;
  }
  printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(launch_status));
}
955
+
956
// Launches one col2im kernel with the grid, block, stream and argument list that
// every dispatch branch of ms_deformable_col2im_cuda shares. The kernel name is
// taken variadically so that template argument lists containing commas
// (e.g. <scalar_t, 64>) survive macro expansion.
#define MSDA_LAUNCH_COL2IM(SHM_BYTES, ...)                                        \
  __VA_ARGS__                                                                     \
      <<<GET_BLOCKS(work_items, num_threads), num_threads, (SHM_BYTES), stream>>>( \
          work_items,                                                             \
          grad_col,                                                               \
          data_value,                                                             \
          data_spatial_shapes,                                                    \
          data_level_start_index,                                                 \
          data_sampling_loc,                                                      \
          data_attn_weight,                                                       \
          batch_size,                                                             \
          spatial_size,                                                           \
          num_heads,                                                              \
          channels,                                                               \
          num_levels,                                                             \
          num_query,                                                              \
          num_point,                                                              \
          grad_value,                                                             \
          grad_sampling_loc,                                                      \
          grad_attn_weight)

/*
 * Host-side launcher for the backward (col2im) pass of multi-scale deformable
 * attention. It picks a kernel variant from `channels` and launches it on
 * `stream` with one work item per (batch, query, head, channel) element and
 * min(channels, CUDA_NUM_THREADS) threads per block:
 *
 *   channels > 1024, multiple of 1024 -> shm_reduce_v2_multi_blocks (dynamic shm)
 *   channels > 1024, otherwise        -> gm (no shared memory)
 *   channels in {1,2,4,8,16,32}       -> shm_blocksize_aware_reduce_v1<channels>
 *   channels in {64,...,1024} (pow 2) -> shm_blocksize_aware_reduce_v2<channels>
 *   any other channels < 64           -> shm_reduce_v1 (dynamic shm)
 *   any other channels <= 1024        -> shm_reduce_v2 (dynamic shm)
 *
 * Variants marked "dynamic shm" receive 3 * num_threads * sizeof(scalar_t) bytes
 * of dynamic shared memory. A launch error is printed, not returned.
 */
template <typename scalar_t>
void ms_deformable_col2im_cuda(cudaStream_t stream,
                               const scalar_t* grad_col,
                               const scalar_t* data_value,
                               const int64_t * data_spatial_shapes,
                               const int64_t * data_level_start_index,
                               const scalar_t * data_sampling_loc,
                               const scalar_t * data_attn_weight,
                               const int batch_size,
                               const int spatial_size,
                               const int num_heads,
                               const int channels,
                               const int num_levels,
                               const int num_query,
                               const int num_point,
                               scalar_t* grad_value,
                               scalar_t* grad_sampling_loc,
                               scalar_t* grad_attn_weight)
{
  const int num_threads = (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
  // The same count sizes the grid and bounds each kernel's index loop.
  const int work_items = batch_size * num_query * num_heads * channels;

  if (channels <= 1024)
  {
    switch (channels)
    {
      // Small power-of-two channel counts: block size known at compile time.
      case 1:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>);
        break;
      case 2:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>);
        break;
      case 4:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>);
        break;
      case 8:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>);
        break;
      case 16:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>);
        break;
      case 32:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>);
        break;
      // Larger power-of-two channel counts use the v2 compile-time variant.
      case 64:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>);
        break;
      case 128:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>);
        break;
      case 256:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>);
        break;
      case 512:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>);
        break;
      case 1024:
        MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>);
        break;
      // Every other channel count: shared memory sized at launch time.
      default:
        if (channels < 64)
        {
          MSDA_LAUNCH_COL2IM(num_threads*3*sizeof(scalar_t),
                             ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>);
        }
        else
        {
          MSDA_LAUNCH_COL2IM(num_threads*3*sizeof(scalar_t),
                             ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>);
        }
    }
  }
  else if ((channels & 1023) == 0)
  {
    // More channels than one block can hold, evenly split over whole blocks.
    MSDA_LAUNCH_COL2IM(num_threads*3*sizeof(scalar_t),
                       ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>);
  }
  else
  {
    MSDA_LAUNCH_COL2IM(0, ms_deformable_col2im_gpu_kernel_gm<scalar_t>);
  }

  const cudaError_t launch_status = cudaGetLastError();
  if (launch_status != cudaSuccess)
  {
    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(launch_status));
  }
}

#undef MSDA_LAUNCH_COL2IM
python/utils/dependencies/XPose/models/UniPose/ops/src/ms_deform_attn.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #pragma once
12
+
13
+ #include "cpu/ms_deform_attn_cpu.h"
14
+
15
+ #ifdef WITH_CUDA
16
+ #include "cuda/ms_deform_attn_cuda.h"
17
+ #endif
18
+
19
+
20
+ at::Tensor
21
+ ms_deform_attn_forward(
22
+ const at::Tensor &value,
23
+ const at::Tensor &spatial_shapes,
24
+ const at::Tensor &level_start_index,
25
+ const at::Tensor &sampling_loc,
26
+ const at::Tensor &attn_weight,
27
+ const int im2col_step)
28
+ {
29
+ if (value.type().is_cuda())
30
+ {
31
+ #ifdef WITH_CUDA
32
+ return ms_deform_attn_cuda_forward(
33
+ value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
34
+ #else
35
+ AT_ERROR("Not compiled with GPU support");
36
+ #endif
37
+ }
38
+ AT_ERROR("Not implemented on the CPU");
39
+ }
40
+
41
+ std::vector<at::Tensor>
42
+ ms_deform_attn_backward(
43
+ const at::Tensor &value,
44
+ const at::Tensor &spatial_shapes,
45
+ const at::Tensor &level_start_index,
46
+ const at::Tensor &sampling_loc,
47
+ const at::Tensor &attn_weight,
48
+ const at::Tensor &grad_output,
49
+ const int im2col_step)
50
+ {
51
+ if (value.type().is_cuda())
52
+ {
53
+ #ifdef WITH_CUDA
54
+ return ms_deform_attn_cuda_backward(
55
+ value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
56
+ #else
57
+ AT_ERROR("Not compiled with GPU support");
58
+ #endif
59
+ }
60
+ AT_ERROR("Not implemented on the CPU");
61
+ }
62
+
python/utils/dependencies/XPose/models/UniPose/ops/src/vision.cpp ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*!
2
+ **************************************************************************************************
3
+ * Deformable DETR
4
+ * Copyright (c) 2020 SenseTime. All Rights Reserved.
5
+ * Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ **************************************************************************************************
7
+ * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
8
+ **************************************************************************************************
9
+ */
10
+
11
+ #include "ms_deform_attn.h"
12
+
13
// Python bindings for the extension module: registers the forward and backward
// entry points from ms_deform_attn.h under their C++ names. The third argument
// of each m.def is the docstring shown on the Python side.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
}
python/utils/dependencies/XPose/models/UniPose/ops/test.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------------------
6
+ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
7
+ # ------------------------------------------------------------------------------------------------
8
+
9
+ from __future__ import absolute_import
10
+ from __future__ import print_function
11
+ from __future__ import division
12
+
13
+ import time
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.autograd import gradcheck
17
+
18
+ from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
19
+
20
+
21
# Problem sizes shared by every check in this script. The checks build
# value as (N, S, M, D), sampling locations as (N, Lq, M, L, P, 2) and
# attention weights as (N, Lq, M, L, P).
N, M, D = 1, 2, 2
Lq, L, P = 2, 2, 2

# One (height, width) pair per feature level, stored as int64 on the GPU.
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()

# Start offset of each level in the flattened value tensor:
# an exclusive prefix sum of the per-level H * W.
level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))

# Total number of flattened positions over all levels.
S = sum(int(h * w) for h, w in shapes.tolist())


# Fixed seed so the random inputs of the checks are reproducible.
torch.manual_seed(3)
29
+
30
+
31
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
    """Compare the custom op with the pure-PyTorch reference in float64.

    Draws random inputs on the GPU, runs both implementations on the
    double-precision copies and prints whether the outputs agree (default
    ``torch.allclose`` tolerances) together with the maximum absolute and
    relative errors.
    """
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights sum to one over the last two axes (levels and points).
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    value_d = value.double()
    locations_d = sampling_locations.double()
    weights_d = attention_weights.double()

    output_pytorch = ms_deform_attn_core_pytorch(value_d, shapes, locations_d, weights_d).detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        value_d, shapes, level_start_index, locations_d, weights_d, im2col_step).detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch)
    abs_err = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_err.max()
    max_rel_err = (abs_err / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
45
+
46
+
47
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
    """Compare the custom op with the pure-PyTorch reference in float32.

    Same procedure as the float64 check, but the inputs keep their default
    precision and the comparison uses looser tolerances
    (``rtol=1e-2``, ``atol=1e-3``). The verdict and the maximum absolute and
    relative errors are printed.
    """
    value = torch.rand(N, S, M, D).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights sum to one over the last two axes (levels and points).
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    output_pytorch = ms_deform_attn_core_pytorch(
        value, shapes, sampling_locations, attention_weights).detach().cpu()
    output_cuda = MSDeformAttnFunction.apply(
        value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
    abs_err = (output_cuda - output_pytorch).abs()
    max_abs_err = abs_err.max()
    max_rel_err = (abs_err / output_pytorch.abs()).max()

    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
61
+
62
+
63
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
    """Run ``torch.autograd.gradcheck`` on the custom op and print the verdict.

    Args:
        channels: size of the last axis of the random ``value`` tensor.
        grad_value: whether ``value`` requires a gradient.
        grad_sampling_loc: whether the sampling locations require a gradient.
        grad_attn_weight: whether the attention weights require a gradient.
    """
    value = torch.rand(N, S, M, channels).cuda() * 0.01
    sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
    attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
    # Normalise so the weights sum to one over the last two axes (levels and points).
    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    im2col_step = 2

    # Gradient flags are set on the float32 tensors; the float64 copies handed
    # to gradcheck are derived from them afterwards.
    for tensor, needs_grad in ((value, grad_value),
                               (sampling_locations, grad_sampling_loc),
                               (attention_weights, grad_attn_weight)):
        tensor.requires_grad = needs_grad

    inputs = (value.double(), shapes, level_start_index,
              sampling_locations.double(), attention_weights.double(), im2col_step)
    gradok = gradcheck(MSDeformAttnFunction.apply, inputs)

    print(f'* {gradok} check_gradient_numerical(D={channels})')
79
+
80
+
81
if __name__ == '__main__':
    # Forward agreement in both precisions first ...
    check_forward_equal_with_pytorch_double()
    check_forward_equal_with_pytorch_float()

    # ... then numerical gradient checks over a spread of channel widths,
    # with gradients enabled for all three differentiable inputs.
    for channels in (30, 32, 64, 71, 1025, 2048, 3096):
        check_gradient_numerical(channels, True, True, True)
87
+
88
+
89
+
python/utils/dependencies/XPose/models/UniPose/position_encoding.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Conditional DETR
7
+ # Copyright (c) 2021 Microsoft. All Rights Reserved.
8
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
9
+ # ------------------------------------------------------------------------
10
+ # Copied from DETR (https://github.com/facebookresearch/detr)
11
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
12
+ # ------------------------------------------------------------------------
13
+
14
+ """
15
+ Various positional encodings for the transformer.
16
+ """
17
+ import math
18
+ import torch
19
+ from torch import nn
20
+
21
+ from util.misc import NestedTensor
22
+
23
+
24
class PositionEmbeddingSine(nn.Module):
    """
    Fixed sine/cosine position embedding for 2-D feature maps, in the spirit of
    the "Attention is all you need" encoding generalised to images.

    Args:
        num_pos_feats (int): number of channels produced per spatial axis.
        temperature (float): base of the geometric frequency progression.
        normalize (bool): if True, rescale positions to ``[0, scale]`` first.
        scale (float | None): range used when normalising; defaults to ``2 * pi``.
            Passing it while ``normalize`` is False raises ``ValueError``.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    @staticmethod
    def _sincos(embed, dim_t):
        """Divide positions by the per-channel periods, then interleave sin (even) and cos (odd) channels."""
        angles = embed[:, :, :, None] / dim_t
        return torch.stack((angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()), dim=4).flatten(3)

    def forward(self, tensor_list: NestedTensor):
        """
        Args:
            tensor_list: object exposing ``tensors`` (only its device is used)
                and a boolean ``mask`` indexed as (batch, row, column).
        Returns:
            Tensor with the y-axis channels followed by the x-axis channels on
            dim 1, i.e. (batch, 2 * num_pos_feats, rows, columns) for an even
            ``num_pos_feats``.
        """
        feats = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None

        # A position is the running count of unmasked (mask == False) cells along each axis.
        valid = ~mask
        y_embed = valid.cumsum(1, dtype=torch.float32)
        x_embed = valid.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # Divide by the last (largest) count of every column / row.
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # Channel pairs (2k, 2k + 1) share the period temperature ** (2k / num_pos_feats).
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=feats.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = self._sincos(x_embed, dim_t)
        pos_y = self._sincos(y_embed, dim_t)
        return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
65
+
66
class PositionEmbeddingSineHW(nn.Module):
    """
    Fixed sine/cosine position embedding for 2-D feature maps that uses
    separate temperatures for the height (y) and width (x) axes.

    Args:
        num_pos_feats (int): number of channels produced per spatial axis.
        temperatureH (float): frequency base for the y axis.
        temperatureW (float): frequency base for the x axis.
        normalize (bool): if True, rescale positions to ``[0, scale]`` first.
        scale (float | None): range used when normalising; defaults to ``2 * pi``.
            Passing it while ``normalize`` is False raises ``ValueError``.
    """
    def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
        super().__init__()
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperatureH = temperatureH
        self.temperatureW = temperatureW
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def _axis_encoding(self, embed, temperature, device):
        """Sin/cos encoding of one axis: even channels carry sin, odd channels cos."""
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=device)
        # Channel pairs (2k, 2k + 1) share the period temperature ** (2k / num_pos_feats).
        dim_t = temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
        angles = embed[:, :, :, None] / dim_t
        return torch.stack((angles[:, :, :, 0::2].sin(), angles[:, :, :, 1::2].cos()), dim=4).flatten(3)

    def forward(self, tensor_list: NestedTensor):
        """
        Args:
            tensor_list: object exposing ``tensors`` (only its device is used)
                and a boolean ``mask`` indexed as (batch, row, column).
        Returns:
            Tensor with the y-axis channels followed by the x-axis channels on
            dim 1, i.e. (batch, 2 * num_pos_feats, rows, columns) for an even
            ``num_pos_feats``.
        """
        feats = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None

        # A position is the running count of unmasked (mask == False) cells along each axis.
        valid = ~mask
        y_embed = valid.cumsum(1, dtype=torch.float32)
        x_embed = valid.cumsum(2, dtype=torch.float32)

        if self.normalize:
            eps = 1e-6
            # Divide by the last (largest) count of every column / row.
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        pos_x = self._axis_encoding(x_embed, self.temperatureW, feats.device)
        pos_y = self._axis_encoding(y_embed, self.temperatureH, feats.device)
        return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
113
+
114
class PositionEmbeddingLearned(nn.Module):
    """
    Learned absolute position embedding built from one table for rows and one
    for columns. Each table holds 50 entries, so feature maps with more than
    50 rows or columns cannot be indexed.
    """
    def __init__(self, num_pos_feats=256):
        super().__init__()
        self.row_embed = nn.Embedding(50, num_pos_feats)
        self.col_embed = nn.Embedding(50, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise both tables from the uniform distribution on [0, 1)."""
        for table in (self.row_embed, self.col_embed):
            nn.init.uniform_(table.weight)

    def forward(self, tensor_list: NestedTensor):
        """
        Args:
            tensor_list: object exposing ``tensors``; only its batch size,
                last two dimensions (height, width) and device are used.
        Returns:
            Tensor of shape (batch, 2 * num_pos_feats, height, width): column
            embeddings in the first half of the channels, row embeddings in
            the second half, repeated for every batch element.
        """
        feats = tensor_list.tensors
        height, width = feats.shape[-2:]
        col_emb = self.col_embed(torch.arange(width, device=feats.device))
        row_emb = self.row_embed(torch.arange(height, device=feats.device))
        # Tile both embeddings over the full (height, width) grid, then join on channels.
        grid = torch.cat([
            col_emb.unsqueeze(0).repeat(height, 1, 1),
            row_emb.unsqueeze(1).repeat(1, width, 1),
        ], dim=-1)
        return grid.permute(2, 0, 1).unsqueeze(0).repeat(feats.shape[0], 1, 1, 1)
140
+
141
+
142
def build_position_encoding(args):
    """Create the position-embedding module selected by ``args.position_embedding``.

    Args:
        args: namespace providing ``hidden_dim`` and ``position_embedding``;
            the sine variant additionally reads ``pe_temperatureH`` and
            ``pe_temperatureW``.
    Returns:
        ``PositionEmbeddingSineHW`` (normalised) for ``'v2'`` / ``'sine'``,
        ``PositionEmbeddingLearned`` for ``'v3'`` / ``'learned'``; both are
        built with ``hidden_dim // 2`` features per axis.
    Raises:
        ValueError: for any other ``position_embedding`` value.
    """
    N_steps = args.hidden_dim // 2
    kind = args.position_embedding

    if kind in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        return PositionEmbeddingSineHW(
            N_steps,
            temperatureH=args.pe_temperatureH,
            temperatureW=args.pe_temperatureW,
            normalize=True
        )

    if kind in ('v3', 'learned'):
        return PositionEmbeddingLearned(N_steps)

    raise ValueError(f"not supported {args.position_embedding}")
python/utils/dependencies/XPose/models/UniPose/swin_transformer.py ADDED
@@ -0,0 +1,701 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint as checkpoint
6
+ import numpy as np
7
+
8
+ from util.misc import NestedTensor
9
+ # from timm.models.layers import DropPath, to_2tuple, trunc_normal_
10
+ from src.modules.util import DropPath, to_2tuple, trunc_normal_
11
+
12
+
13
+
14
class Mlp(nn.Module):
    """Two-layer feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): width of the input.
        hidden_features (int | None): width of the hidden layer; falls back to ``in_features``.
        out_features (int | None): width of the output; falls back to ``in_features``.
        act_layer: zero-argument factory for the activation module. Default: nn.GELU
        drop (float): dropout probability used after both linear layers. Default: 0.0
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        # Any falsy width (None or 0) falls back to the input width.
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
33
+
34
+
35
def window_partition(x, window_size):
    """Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C); H and W must be multiples of ``window_size``.
        window_size (int): side length of a window.
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    rows, cols = H // window_size, W // window_size
    tiled = x.view(B, rows, window_size, cols, window_size, C)
    # Put the two window-grid axes next to each other so that every window
    # becomes one entry along the leading dimension.
    return tiled.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
47
+
48
+
49
def window_reverse(windows, window_size, H, W):
    """Reassemble non-overlapping windows into a feature map.

    The batch size is derived with exact integer arithmetic (number of windows
    divided by windows per image), so no float rounding is involved.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image; must be a multiple of ``window_size``.
        W (int): Width of image; must be a multiple of ``window_size``.
    Returns:
        x: (B, H, W, C)
    """
    rows, cols = H // window_size, W // window_size
    B = windows.shape[0] // (rows * cols)
    x = windows.view(B, rows, cols, window_size, window_size, -1)
    # Interleave window-grid axes with in-window axes to restore the (H, W) layout.
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
63
+
64
+
65
class WindowAttention(nn.Module):
    """ Window based multi-head self attention (W-MSA) with a learned relative position bias.
    Works for both shifted and non-shifted windows; for shifted windows the
    caller supplies the attention mask.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        win_h, win_w = window_size[0], window_size[1]

        # One learnable bias per head for every possible relative offset
        # between two tokens of a window: (2*Wh-1) * (2*Ww-1) rows, nH columns.
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * win_h - 1) * (2 * win_w - 1), num_heads))

        # For each (query token, key token) pair, precompute the row of the
        # bias table that holds its relative-offset bias.
        grid = torch.stack(torch.meshgrid([torch.arange(win_h), torch.arange(win_w)]))  # 2, Wh, Ww
        flat = torch.flatten(grid, 1)  # 2, Wh*Ww
        offsets = flat[:, :, None] - flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
        offsets = offsets.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        offsets[:, :, 0] += win_h - 1  # shift both offsets so they start from 0
        offsets[:, :, 1] += win_w - 1
        offsets[:, :, 0] *= 2 * win_w - 1  # row stride of the flattened table
        self.register_buffer("relative_position_index", offsets.sum(-1))  # Wh*Ww, Wh*Ww

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """ Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        Returns:
            Tensor with the same shape as ``x``.
        """
        B_, N, C = x.shape
        heads = self.num_heads
        qkv = self.qkv(x).reshape(B_, N, 3, heads, C // heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # explicit indexing keeps torchscript happy

        attn = (q * self.scale) @ k.transpose(-2, -1)

        # Gather the per-pair bias and bring it to (nH, Wh*Ww, Wh*Ww).
        tokens = self.window_size[0] * self.window_size[1]
        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        bias = bias.view(tokens, tokens, -1).permute(2, 0, 1).contiguous()
        attn = attn + bias.unsqueeze(0)

        if mask is not None:
            # The mask is shared across the batch: expose the window axis,
            # add the mask per window, then fold the axes back together.
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, heads, N, N)
        attn = self.attn_drop(self.softmax(attn))

        out = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        return self.proj_drop(self.proj(out))
144
+
145
+
146
class SwinTransformerBlock(nn.Module):
    """ Swin Transformer Block: (shifted-)window attention followed by an MLP,
    each wrapped in a residual connection with stochastic depth.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

        # Stochastic depth is only instantiated when it would actually drop something.
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

        # Spatial resolution of the incoming tokens; must be assigned by the
        # caller before forward() is invoked.
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C), where H and W are read
                from ``self.H`` / ``self.W``.
            mask_matrix: Attention mask for cyclic shift; ignored when shift_size is 0.
        Returns:
            Tensor of size (B, H*W, C).
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"
        ws, shift = self.window_size, self.shift_size
        shifted = shift > 0

        shortcut = x
        x = self.norm1(x).view(B, H, W, C)

        # Pad right/bottom so that both spatial sizes are multiples of the window size.
        pad_l = pad_t = 0
        pad_r = (ws - W % ws) % ws
        pad_b = (ws - H % ws) % ws
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        # Cyclic shift (SW-MSA only); the mask is used only together with the shift.
        if shifted:
            x = torch.roll(x, shifts=(-shift, -shift), dims=(1, 2))
        attn_mask = mask_matrix if shifted else None

        # Window partition -> attention -> window merge.
        windows = window_partition(x, ws).view(-1, ws * ws, C)  # nW*B, ws*ws, C
        windows = self.attn(windows, mask=attn_mask)  # nW*B, ws*ws, C
        x = window_reverse(windows.view(-1, ws, ws, C), ws, Hp, Wp)  # B H' W' C

        # Undo the cyclic shift.
        if shifted:
            x = torch.roll(x, shifts=(shift, shift), dims=(1, 2))

        # Drop the padding again.
        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()
        x = x.view(B, H * W, C)

        # Residual connections around the attention and the FFN.
        x = shortcut + self.drop_path(x)
        return x + self.drop_path(self.mlp(self.norm2(x)))
244
+
245
+
246
class PatchMerging(nn.Module):
    """ Patch Merging Layer: concatenates every 2x2 neighbourhood of tokens
    (4*C channels), normalises it and projects it linearly to 2*C channels.
    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """
    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            Tensor of size (B, ceil(H/2)*ceil(W/2), 2*C).
        """
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        grid = x.view(B, H, W, C)

        # Odd sizes get one zero row / column so the 2x2 grouping is complete.
        if H % 2 or W % 2:
            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))

        # Channel order of the four neighbours as (row offset, column offset):
        # (0, 0), (1, 0), (0, 1), (1, 1).
        merged = torch.cat([grid[:, r::2, c::2, :] for c in (0, 1) for r in (0, 1)], -1)  # B H/2 W/2 4*C
        merged = merged.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))
285
+
286
+
287
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage: ``depth`` blocks that
    alternate between regular and shifted windows, optionally followed by a
    downsampling layer.
    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate,
            either one value for all blocks or one value per block. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # Accept any per-block sequence (list or tuple); a scalar is shared by all blocks.
        per_block_drop = isinstance(drop_path, (list, tuple))

        # build blocks: even-indexed blocks use regular windows, odd-indexed ones shifted windows
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if per_block_drop else drop_path,
                norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            ``(x, H, W, x_down, Wh, Ww)``: the stage output with its resolution,
            followed by the downsampled output with its resolution
            ``((H + 1) // 2, (W + 1) // 2)``. Without a downsample layer the
            second triple repeats the first.
        """
        ws, shift = self.window_size, self.shift_size

        # calculate attention mask for SW-MSA on the padded resolution
        Hp = int(np.ceil(H / ws)) * ws
        Wp = int(np.ceil(W / ws)) * ws
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1

        # Three bands per axis give nine regions; each region gets its own id.
        h_slices = (slice(0, -ws), slice(-ws, -shift), slice(-shift, None))
        w_slices = (slice(0, -ws), slice(-ws, -shift), slice(-shift, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, ws)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, ws * ws)
        # Token pairs from different regions must not attend to each other:
        # a non-zero id difference becomes -100, equal ids become 0.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))

        for blk in self.blocks:
            # Blocks read the spatial resolution from their own attributes.
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)

        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        return x, H, W, x, H, W
387
+
388
+
389
class PatchEmbed(nn.Module):
    """ Image to Patch Embedding: a strided convolution turns every
    non-overlapping patch into one embedding vector.
    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # Kernel size equals stride, so patches do not overlap.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Embed an image batch.

        Args:
            x: tensor of size (B, in_chans, H, W).
        Returns:
            Tensor of size (B, embed_dim, Wh, Ww), where Wh and Ww are the
            numbers of patches along height and width after padding.
        """
        _, _, H, W = x.size()
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]

        # Zero-pad on the right / bottom up to the next multiple of the patch size.
        if W % patch_w != 0:
            x = F.pad(x, (0, patch_w - W % patch_w))
        if H % patch_h != 0:
            x = F.pad(x, (0, 0, 0, patch_h - H % patch_h))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is None:
            return x

        # The norm acts on the channel axis: go to (B, Wh*Ww, C) and back.
        Wh, Ww = x.size(2), x.size(3)
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
429
+
430
+
431
+ class SwinTransformer(nn.Module):
432
+ """ Swin Transformer backbone.
433
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
434
+ https://arxiv.org/pdf/2103.14030
435
+ Args:
436
+ pretrain_img_size (int): Input image size for training the pretrained model,
437
+ used in absolute postion embedding. Default 224.
438
+ patch_size (int | tuple(int)): Patch size. Default: 4.
439
+ in_chans (int): Number of input image channels. Default: 3.
440
+ embed_dim (int): Number of linear projection output channels. Default: 96.
441
+ depths (tuple[int]): Depths of each Swin Transformer stage.
442
+ num_heads (tuple[int]): Number of attention head of each stage.
443
+ window_size (int): Window size. Default: 7.
444
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
445
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
446
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
447
+ drop_rate (float): Dropout rate.
448
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
449
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
450
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
451
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
452
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
453
+ out_indices (Sequence[int]): Output from which stages.
454
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
455
+ -1 means not freezing any parameters.
456
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
457
+ dilation (bool): if True, the output size if 16x downsample, ow 32x downsample.
458
+ """
459
+
460
+ def __init__(self,
461
+ pretrain_img_size=224,
462
+ patch_size=4,
463
+ in_chans=3,
464
+ embed_dim=96,
465
+ depths=[2, 2, 6, 2],
466
+ num_heads=[3, 6, 12, 24],
467
+ window_size=7,
468
+ mlp_ratio=4.,
469
+ qkv_bias=True,
470
+ qk_scale=None,
471
+ drop_rate=0.,
472
+ attn_drop_rate=0.,
473
+ drop_path_rate=0.2,
474
+ norm_layer=nn.LayerNorm,
475
+ ape=False,
476
+ patch_norm=True,
477
+ out_indices=(0, 1, 2, 3),
478
+ frozen_stages=-1,
479
+ dilation=False,
480
+ use_checkpoint=False):
481
+ super().__init__()
482
+
483
+ self.pretrain_img_size = pretrain_img_size
484
+ self.num_layers = len(depths)
485
+ self.embed_dim = embed_dim
486
+ self.ape = ape
487
+ self.patch_norm = patch_norm
488
+ self.out_indices = out_indices
489
+ self.frozen_stages = frozen_stages
490
+ self.dilation = dilation
491
+
492
+ # if use_checkpoint:
493
+ # print("use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!")
494
+
495
+ # split image into non-overlapping patches
496
+ self.patch_embed = PatchEmbed(
497
+ patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
498
+ norm_layer=norm_layer if self.patch_norm else None)
499
+
500
+ # absolute position embedding
501
+ if self.ape:
502
+ pretrain_img_size = to_2tuple(pretrain_img_size)
503
+ patch_size = to_2tuple(patch_size)
504
+ patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
505
+
506
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
507
+ trunc_normal_(self.absolute_pos_embed, std=.02)
508
+
509
+ self.pos_drop = nn.Dropout(p=drop_rate)
510
+
511
+ # stochastic depth
512
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
513
+
514
+ # build layers
515
+ self.layers = nn.ModuleList()
516
+ # prepare downsample list
517
+ downsamplelist = [PatchMerging for i in range(self.num_layers)]
518
+ downsamplelist[-1] = None
519
+ num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
520
+ if self.dilation:
521
+ downsamplelist[-2] = None
522
+ num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2
523
+ for i_layer in range(self.num_layers):
524
+ layer = BasicLayer(
525
+ # dim=int(embed_dim * 2 ** i_layer),
526
+ dim=num_features[i_layer],
527
+ depth=depths[i_layer],
528
+ num_heads=num_heads[i_layer],
529
+ window_size=window_size,
530
+ mlp_ratio=mlp_ratio,
531
+ qkv_bias=qkv_bias,
532
+ qk_scale=qk_scale,
533
+ drop=drop_rate,
534
+ attn_drop=attn_drop_rate,
535
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
536
+ norm_layer=norm_layer,
537
+ # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
538
+ downsample=downsamplelist[i_layer],
539
+ use_checkpoint=use_checkpoint)
540
+ self.layers.append(layer)
541
+
542
+ # num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
543
+ self.num_features = num_features
544
+
545
+ # add a norm layer for each output
546
+ for i_layer in out_indices:
547
+ layer = norm_layer(num_features[i_layer])
548
+ layer_name = f'norm{i_layer}'
549
+ self.add_module(layer_name, layer)
550
+
551
+ self._freeze_stages()
552
+
553
+ def _freeze_stages(self):
554
+ if self.frozen_stages >= 0:
555
+ self.patch_embed.eval()
556
+ for param in self.patch_embed.parameters():
557
+ param.requires_grad = False
558
+
559
+ if self.frozen_stages >= 1 and self.ape:
560
+ self.absolute_pos_embed.requires_grad = False
561
+
562
+ if self.frozen_stages >= 2:
563
+ self.pos_drop.eval()
564
+ for i in range(0, self.frozen_stages - 1):
565
+ m = self.layers[i]
566
+ m.eval()
567
+ for param in m.parameters():
568
+ param.requires_grad = False
569
+
570
+
571
+
572
+ def forward_raw(self, x):
573
+ """Forward function."""
574
+ x = self.patch_embed(x)
575
+
576
+ Wh, Ww = x.size(2), x.size(3)
577
+ if self.ape:
578
+ # interpolate the position embedding to the corresponding size
579
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
580
+ x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
581
+ else:
582
+ x = x.flatten(2).transpose(1, 2)
583
+ x = self.pos_drop(x)
584
+
585
+ outs = []
586
+ for i in range(self.num_layers):
587
+ layer = self.layers[i]
588
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
589
+ # import ipdb; ipdb.set_trace()
590
+
591
+ if i in self.out_indices:
592
+ norm_layer = getattr(self, f'norm{i}')
593
+ x_out = norm_layer(x_out)
594
+
595
+ out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
596
+ outs.append(out)
597
+ # in:
598
+ # torch.Size([2, 3, 1024, 1024])
599
+ # outs:
600
+ # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
601
+ # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
602
+ return tuple(outs)
603
+
604
+
605
+ def forward(self, tensor_list: NestedTensor):
606
+ x = tensor_list.tensors
607
+
608
+ """Forward function."""
609
+ x = self.patch_embed(x)
610
+
611
+ Wh, Ww = x.size(2), x.size(3)
612
+ if self.ape:
613
+ # interpolate the position embedding to the corresponding size
614
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
615
+ x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
616
+ else:
617
+ x = x.flatten(2).transpose(1, 2)
618
+ x = self.pos_drop(x)
619
+
620
+ outs = []
621
+ for i in range(self.num_layers):
622
+ layer = self.layers[i]
623
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
624
+
625
+ if i in self.out_indices:
626
+ norm_layer = getattr(self, f'norm{i}')
627
+ x_out = norm_layer(x_out)
628
+
629
+ out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
630
+ outs.append(out)
631
+ # in:
632
+ # torch.Size([2, 3, 1024, 1024])
633
+ # out:
634
+ # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
635
+ # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
636
+
637
+ # collect for nesttensors
638
+ outs_dict = {}
639
+ for idx, out_i in enumerate(outs):
640
+ m = tensor_list.mask
641
+ assert m is not None
642
+ mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]
643
+ outs_dict[idx] = NestedTensor(out_i, mask)
644
+
645
+ return outs_dict
646
+
647
+
648
+ def train(self, mode=True):
649
+ """Convert the model into training mode while keep layers freezed."""
650
+ super(SwinTransformer, self).train(mode)
651
+ self._freeze_stages()
652
+
653
+
654
+
655
def build_swin_transformer(modelname, pretrain_img_size, **kw):
    """Create a ``SwinTransformer`` from a named preset.

    Args:
        modelname: one of the preset names checked by the assertion below.
        pretrain_img_size: forwarded to ``SwinTransformer``.
        **kw: extra constructor arguments; they override preset values.
    """
    known_names = ('swin_T_224_1k', 'swin_B_224_22k', 'swin_B_384_22k',
                   'swin_L_224_22k', 'swin_L_384_22k')
    assert modelname in known_names

    presets = {
        'swin_T_224_1k': {
            'embed_dim': 96,
            'depths': [2, 2, 6, 2],
            'num_heads': [3, 6, 12, 24],
            'window_size': 7,
        },
        'swin_B_224_22k': {
            'embed_dim': 128,
            'depths': [2, 2, 18, 2],
            'num_heads': [4, 8, 16, 32],
            'window_size': 7,
        },
        'swin_B_384_22k': {
            'embed_dim': 128,
            'depths': [2, 2, 18, 2],
            'num_heads': [4, 8, 16, 32],
            'window_size': 12,
        },
        'swin_L_224_22k': {
            'embed_dim': 192,
            'depths': [2, 2, 18, 2],
            'num_heads': [6, 12, 24, 48],
            'window_size': 7,
        },
        'swin_L_384_22k': {
            'embed_dim': 192,
            'depths': [2, 2, 18, 2],
            'num_heads': [6, 12, 24, 48],
            'window_size': 12,
        },
    }
    # Caller-supplied keywords win over the preset.
    config = {**presets[modelname], **kw}
    return SwinTransformer(pretrain_img_size=pretrain_img_size, **config)
694
+
695
if __name__ == "__main__":
    # Smoke test: build the dilated Swin-L preset and run it at two input
    # resolutions. The interactive ipdb breakpoint that used to sit between
    # the two runs is gone, so the script no longer needs ipdb and runs
    # unattended; the output shapes are printed instead.
    model = build_swin_transformer('swin_L_384_22k', 384, dilation=True)
    x = torch.rand(2, 3, 1024, 1024)
    y = model.forward_raw(x)
    print([tuple(level.shape) for level in y])
    x = torch.rand(2, 3, 384, 384)
    y = model.forward_raw(x)
    print([tuple(level.shape) for level in y])
python/utils/dependencies/XPose/models/UniPose/transformer_deformable.py ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Deformable DETR
7
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
8
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
9
+ # ------------------------------------------------------------------------
10
+ # Modified from DETR (https://github.com/facebookresearch/detr)
11
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
12
+ # ------------------------------------------------------------------------
13
+
14
+ import copy
15
+ import math
16
+ import torch
17
+ from torch import nn, Tensor
18
+ from torch.nn.init import xavier_uniform_, constant_, normal_
19
+ from typing import Optional
20
+
21
+ from util.misc import inverse_sigmoid
22
+ from .ops.modules import MSDeformAttn
23
+ from .utils import MLP, _get_activation_fn, gen_sineembed_for_position
24
+
25
class DeformableTransformer(nn.Module):
    """Multi-scale deformable encoder/decoder transformer.

    ``forward`` flattens the per-level feature maps into one token sequence,
    runs the deformable encoder, derives decoder targets and reference points
    (from encoder proposals when ``two_stage`` is set, otherwise from
    ``query_embed``) and runs the decoder.
    """

    def __init__(self, d_model=256, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
                 activation="relu", return_intermediate_dec=False,
                 num_feature_levels=4, dec_n_points=4, enc_n_points=4,
                 two_stage=False, two_stage_num_proposals=300,
                 use_dab=False, high_dim_query_update=False, no_sine_embed=False):
        """Build the encoder, the decoder and the query-preparation heads.

        Raises:
            NotImplementedError: if ``high_dim_query_update`` or
                ``no_sine_embed`` is truthy. ``DeformableTransformerDecoder``
                in this module has no such options.
        """
        super().__init__()

        # DeformableTransformerDecoder.__init__ accepts neither of these
        # keywords, so forwarding them made every construction fail with a
        # TypeError. They are no longer forwarded; asking for them is
        # reported explicitly instead of being silently ignored.
        if high_dim_query_update or no_sine_embed:
            raise NotImplementedError(
                "high_dim_query_update and no_sine_embed are not supported by "
                "DeformableTransformerDecoder in this module")

        self.d_model = d_model
        self.nhead = nhead
        self.two_stage = two_stage
        self.two_stage_num_proposals = two_stage_num_proposals
        self.use_dab = use_dab

        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels, nhead, enc_n_points)
        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)

        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels, nhead, dec_n_points)
        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec,
                                                    use_dab=use_dab, d_model=d_model)

        # One learned embedding per feature level, added to the positional
        # embedding of that level in forward(). Initialised in
        # _reset_parameters().
        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))

        if two_stage:
            self.enc_output = nn.Linear(d_model, d_model)
            self.enc_output_norm = nn.LayerNorm(d_model)
            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
            self.pos_trans_norm = nn.LayerNorm(d_model * 2)
        else:
            if not self.use_dab:
                # Predicts 2-d reference points from the query embedding.
                self.reference_points = nn.Linear(d_model, 2)

        # Always falsy here because of the guard above; the attribute is kept
        # for code that reads it. The former assertion on this flag
        # ("use_dab must be True", guarding `not self.use_dab`) was
        # self-contradictory and is unreachable now.
        self.high_dim_query_update = high_dim_query_update

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise matrices, then apply module-specific resets."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # MSDeformAttn has its own scheme; run it after the blanket init.
        for m in self.modules():
            if isinstance(m, MSDeformAttn):
                m._reset_parameters()
        if not self.two_stage and not self.use_dab:
            xavier_uniform_(self.reference_points.weight.data, gain=1.0)
            constant_(self.reference_points.bias.data, 0.)
        normal_(self.level_embed)

    def get_proposal_pos_embed(self, proposals):
        """Sine/cosine embedding of un-activated proposals.

        The last axis of ``proposals`` is expanded to 128 sine/cosine features
        per coordinate and the coordinate axes are flattened together.
        """
        num_pos_feats = 128
        temperature = 10000
        scale = 2 * math.pi

        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
        # N, L, 4
        proposals = proposals.sigmoid() * scale
        # N, L, 4, 128
        pos = proposals[:, :, :, None] / dim_t
        # N, L, 4, 64, 2
        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
        return pos

    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
        """Turn encoder memory into per-token box proposals (two-stage mode).

        Returns:
            (output_memory, output_proposals): the projected and normalised
            memory with padded/invalid tokens zeroed beforehand, and the
            inverse-sigmoid proposals with padded/invalid entries set to +inf.
        """
        N_, S_, C_ = memory.shape
        proposals = []
        _cur = 0
        for lvl, (H_, W_) in enumerate(spatial_shapes):
            # Slice this level's tokens out of the flattened mask.
            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)

            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)

            # Cell centres, normalised by the un-padded extent of each image.
            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
            # Proposal width/height doubles with every level.
            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
            proposals.append(proposal)
            _cur += (H_ * W_)
        output_proposals = torch.cat(proposals, 1)
        # Keep only proposals whose four values lie strictly inside (0.01, 0.99).
        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
        # Inverse sigmoid.
        output_proposals = torch.log(output_proposals / (1 - output_proposals))
        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))

        output_memory = memory
        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
        output_memory = self.enc_output_norm(self.enc_output(output_memory))
        return output_memory, output_proposals

    def get_valid_ratio(self, mask):
        """Fraction of un-masked width and height for each mask in the batch.

        ``mask`` is indexed as (batch, H, W) with True marking padding; the
        result stacks (width ratio, height ratio) on the last axis.
        """
        _, H, W = mask.shape
        valid_H = torch.sum(~mask[:, :, 0], 1)
        valid_W = torch.sum(~mask[:, 0, :], 1)
        valid_ratio_h = valid_H.float() / H
        valid_ratio_w = valid_W.float() / W
        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
        return valid_ratio

    def forward(self, srcs, masks, pos_embeds, query_embed=None):
        """
        Input:
            - srcs: List([bs, c, h, w])
            - masks: List([bs, h, w])
            - pos_embeds: per-level positional embeddings, flattened like srcs
            - query_embed: required unless two_stage is set

        Returns:
            (hs, init_reference_out, inter_references_out, enc_outputs_class,
            enc_outputs_coord_unact); the last two are None unless two_stage.
        """
        assert self.two_stage or query_embed is not None

        # prepare input for encoder
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            bs, c, h, w = src.shape
            spatial_shape = (h, w)
            spatial_shapes.append(spatial_shape)

            src = src.flatten(2).transpose(1, 2)                # bs, hw, c
            mask = mask.flatten(1)                              # bs, hw
            pos_embed = pos_embed.flatten(2).transpose(1, 2)    # bs, hw, c
            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            src_flatten.append(src)
            mask_flatten.append(mask)
        src_flatten = torch.cat(src_flatten, 1)     # bs, \sum{hxw}, c
        mask_flatten = torch.cat(mask_flatten, 1)   # bs, \sum{hxw}
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
        # Offset of each level's first token inside the flattened sequence.
        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)

        # encoder
        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)

        # prepare input for decoder
        bs, _, c = memory.shape
        if self.two_stage:
            output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)

            # hack implementation for two-stage Deformable DETR: the heads at
            # index `num_layers` of the decoder's class/bbox embeds score the
            # encoder output. They are assigned from outside this class.
            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals

            topk = self.two_stage_num_proposals
            topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
            topk_coords_unact = topk_coords_unact.detach()
            reference_points = topk_coords_unact.sigmoid()
            init_reference_out = reference_points
            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
        elif self.use_dab:
            # query_embed carries the content part first, then the anchor part.
            reference_points = query_embed[..., self.d_model:].sigmoid()
            tgt = query_embed[..., :self.d_model]
            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
            init_reference_out = reference_points
        else:
            query_embed, tgt = torch.split(query_embed, c, dim=1)
            query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)
            tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
            reference_points = self.reference_points(query_embed).sigmoid()
            # bs, num_quires, 2
            init_reference_out = reference_points

        # decoder
        hs, inter_references = self.decoder(tgt, reference_points, memory,
                                            spatial_shapes, level_start_index, valid_ratios,
                                            query_pos=query_embed if not self.use_dab else None,
                                            src_padding_mask=mask_flatten)

        inter_references_out = inter_references
        if self.two_stage:
            return hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact
        return hs, init_reference_out, inter_references_out, None, None
213
+
214
+
215
class DeformableTransformerEncoderLayer(nn.Module):
    """One encoder layer: deformable self-attention, feed-forward network and
    an optional channel-attention step, each with a residual and LayerNorm."""

    def __init__(self,
                 d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 add_channel_attention=False,
                 use_deformable_box_attn=False,
                 box_attn_type='roi_align',
                 ):
        super().__init__()

        # --- self attention ---
        # NOTE(review): MSDeformableBoxAttention is not among the imports
        # visible at the top of this file; confirm where it should come from.
        if not use_deformable_box_attn:
            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        else:
            self.self_attn = MSDeformableBoxAttention(
                d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # --- feed-forward network ---
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # --- optional channel attention ---
        self.add_channel_attention = add_channel_attention
        if add_channel_attention:
            self.activ_channel = _get_activation_fn('dyrelu', d_model=d_model)
            self.norm_channel = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Feed-forward sub-layer with residual connection and LayerNorm."""
        hidden = self.activation(self.linear1(src))
        projected = self.linear2(self.dropout2(hidden))
        return self.norm2(src + self.dropout3(projected))

    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None):
        """Apply self-attention, the FFN and (if enabled) channel attention."""
        # Queries carry the positional embedding; the attended values do not.
        query = self.with_pos_embed(src, pos)
        attended = self.self_attn(query, reference_points, src, spatial_shapes,
                                  level_start_index, key_padding_mask)
        src = self.norm1(src + self.dropout1(attended))

        src = self.forward_ffn(src)

        if self.add_channel_attention:
            src = self.norm_channel(src + self.activ_channel(src))

        return src
273
+
274
+
275
class DeformableTransformerEncoder(nn.Module):
    """Stack of cloned encoder layers with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        # With no layers requested the prototype is simply not used.
        self.layers = _get_clones(encoder_layer, num_layers) if num_layers > 0 else []
        self.num_layers = num_layers
        self.norm = norm

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """Cell-centre reference points for every token of every level.

        Centres are normalised by the valid (un-padded) extent of their own
        level and then rescaled by the valid ratios of all levels, which adds
        a per-level axis in front of the (x, y) pair.
        """
        per_level = []
        for level, (height, width) in enumerate(spatial_shapes):
            rows = torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device)
            cols = torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device)
            grid_y, grid_x = torch.meshgrid(rows, cols)
            norm_y = grid_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
            norm_x = grid_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
            per_level.append(torch.stack((norm_x, norm_y), -1))
        all_points = torch.cat(per_level, 1)
        return all_points[:, :, None] * valid_ratios[:, None]

    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
        """
        Input:
            - src: [bs, sum(hi*wi), 256]
            - spatial_shapes: h,w of each level [num_level, 2]
            - level_start_index: [num_level] start point of level in sum(hi*wi).
            - valid_ratios: [bs, num_level, 2]
            - pos: pos embed for src. [bs, sum(hi*wi), 256]
            - padding_mask: [bs, sum(hi*wi)]
        Intermediate:
            - reference_points: [bs, sum(hi*wi), num_level, 2]
        """
        hidden = src
        # Reference points are only needed when there are layers to run.
        if self.num_layers > 0:
            reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, pos, reference_points, spatial_shapes,
                                   level_start_index, padding_mask)

        if self.norm is None:
            return hidden
        return self.norm(hidden)
325
+
326
+
327
class DeformableTransformerDecoderLayer(nn.Module):
    """One decoder layer made of self-attention ('sa'), deformable
    cross-attention ('ca') and a feed-forward network ('ffn'), applied in the
    order given by ``module_seq``.

    The sub-layers work on sequence-first tensors (``nq, bs, d_model`` for
    targets, ``hw, bs, d_model`` for memory); the deformable attention calls
    transpose to batch-first internally.
    """

    def __init__(self, d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4,
                 use_deformable_box_attn=False,
                 box_attn_type='roi_align',
                 key_aware_type=None,
                 decoder_sa_type='sa',
                 module_seq=('sa', 'ca', 'ffn'),
                 ):
        """Create the sub-layers.

        ``decoder_sa_type`` now defaults to ``'sa'``: the former default
        ``'ca'`` is rejected by the assertion below, so a layer built with
        default arguments could never be constructed. ``module_seq`` is copied
        so instances do not share one mutable default.
        """
        super().__init__()
        self.module_seq = list(module_seq)
        assert sorted(self.module_seq) == ['ca', 'ffn', 'sa']

        # cross attention
        # NOTE(review): MSDeformableBoxAttention is not among the imports
        # visible at the top of this file; confirm where it should come from.
        if use_deformable_box_attn:
            self.cross_attn = MSDeformableBoxAttention(d_model, n_levels, n_heads, n_boxes=n_points, used_func=box_attn_type)
        else:
            self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)

        self.key_aware_type = key_aware_type
        # NOTE(review): never replaced inside this class, so
        # key_aware_type='proj_mean' would call None unless a caller assigns a
        # projection module here.
        self.key_aware_proj = None
        self.decoder_sa_type = decoder_sa_type
        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']

        if decoder_sa_type == 'ca_content':
            # The "self-attention" slot attends to the memory instead.
            self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)

    def rm_self_attn_modules(self):
        """Drop the self-attention sub-layer; ``forward_sa`` then returns its input."""
        self.self_attn = None
        self.dropout2 = None
        self.norm2 = None

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt):
        """Feed-forward sub-layer with residual connection and LayerNorm."""
        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout4(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_sa(self,
                   # for tgt
                   tgt: Optional[Tensor],  # nq, bs, d_model
                   tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                   tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                   tgt_key_padding_mask: Optional[Tensor] = None,
                   tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                   # for memory
                   memory: Optional[Tensor] = None,  # hw, bs, d_model
                   memory_key_padding_mask: Optional[Tensor] = None,
                   memory_level_start_index: Optional[Tensor] = None,  # num_levels
                   memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                   memory_pos: Optional[Tensor] = None,  # pos for memory

                   # sa
                   self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                   cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                   ):
        """Self-attention sub-layer; the variant is chosen by ``decoder_sa_type``.

        Returns ``tgt`` unchanged when the self-attention modules were removed
        with ``rm_self_attn_modules``.
        """
        if self.self_attn is not None:
            if self.decoder_sa_type == 'sa':
                # Plain query-to-query attention.
                q = k = self.with_pos_embed(tgt, tgt_query_pos)
                tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
                tgt = tgt + self.dropout2(tgt2)
                tgt = self.norm2(tgt)
            elif self.decoder_sa_type == 'ca_label':
                # NOTE(review): self.label_embedding is not created in
                # __init__; presumably a caller must attach it before this
                # branch is used.
                bs = tgt.shape[1]
                k = v = self.label_embedding.weight[:, None, :].repeat(1, bs, 1)
                tgt2 = self.self_attn(tgt, k, v, attn_mask=self_attn_mask)[0]
                tgt = tgt + self.dropout2(tgt2)
                tgt = self.norm2(tgt)
            elif self.decoder_sa_type == 'ca_content':
                # Deformable attention over the memory (batch-first inside).
                tgt2 = self.self_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
                                      tgt_reference_points.transpose(0, 1).contiguous(),
                                      memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
                tgt = tgt + self.dropout2(tgt2)
                tgt = self.norm2(tgt)
            else:
                raise NotImplementedError("Unknown decoder_sa_type {}".format(self.decoder_sa_type))

        return tgt

    def forward_ca(self,
                   # for tgt
                   tgt: Optional[Tensor],  # nq, bs, d_model
                   tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                   tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                   tgt_key_padding_mask: Optional[Tensor] = None,
                   tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                   # for memory
                   memory: Optional[Tensor] = None,  # hw, bs, d_model
                   memory_key_padding_mask: Optional[Tensor] = None,
                   memory_level_start_index: Optional[Tensor] = None,  # num_levels
                   memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                   memory_pos: Optional[Tensor] = None,  # pos for memory

                   # sa
                   self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                   cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                   ):
        """Deformable cross-attention from the targets to the memory."""
        if self.key_aware_type is not None:
            # Optionally bias the queries with a summary of the memory.
            if self.key_aware_type == 'mean':
                tgt = tgt + memory.mean(0, keepdim=True)
            elif self.key_aware_type == 'proj_mean':
                tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True)
            else:
                raise NotImplementedError("Unknown key_aware_type: {}".format(self.key_aware_type))
        tgt2 = self.cross_attn(self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
                               tgt_reference_points.transpose(0, 1).contiguous(),
                               memory.transpose(0, 1), memory_spatial_shapes, memory_level_start_index, memory_key_padding_mask).transpose(0, 1)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        return tgt

    def forward(self,
                # for tgt
                tgt: Optional[Tensor],  # nq, bs, d_model
                tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
                tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
                tgt_key_padding_mask: Optional[Tensor] = None,
                tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4

                # for memory
                memory: Optional[Tensor] = None,  # hw, bs, d_model
                memory_key_padding_mask: Optional[Tensor] = None,
                memory_level_start_index: Optional[Tensor] = None,  # num_levels
                memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
                memory_pos: Optional[Tensor] = None,  # pos for memory

                # sa
                self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
                cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
                ):
        """Run the sub-layers named in ``module_seq`` in order.

        Raises:
            ValueError: if ``module_seq`` contains an unknown name.
        """
        # Both attention sub-layers take the same arguments; pass them by
        # name so an argument cannot land on the wrong parameter.
        shared = dict(
            tgt_query_pos=tgt_query_pos,
            tgt_query_sine_embed=tgt_query_sine_embed,
            tgt_key_padding_mask=tgt_key_padding_mask,
            tgt_reference_points=tgt_reference_points,
            memory=memory,
            memory_key_padding_mask=memory_key_padding_mask,
            memory_level_start_index=memory_level_start_index,
            memory_spatial_shapes=memory_spatial_shapes,
            memory_pos=memory_pos,
            self_attn_mask=self_attn_mask,
            cross_attn_mask=cross_attn_mask,
        )

        for funcname in self.module_seq:
            if funcname == 'ffn':
                tgt = self.forward_ffn(tgt)
            elif funcname == 'ca':
                tgt = self.forward_ca(tgt, **shared)
            elif funcname == 'sa':
                tgt = self.forward_sa(tgt, **shared)
            else:
                raise ValueError('unknown funcname {}'.format(funcname))

        return tgt
512
+
513
+
514
+
515
class DeformableTransformerDecoder(nn.Module):
    """Stack of cloned decoder layers with optional iterative box refinement.

    Inputs and outputs of ``forward`` are batch-first; each layer is called
    with sequence-first tensors, which is the layout the decoder layer in
    this file documents for its arguments.
    """

    def __init__(self, decoder_layer, num_layers, return_intermediate=False, use_dab=False, d_model=256, query_dim=4):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.return_intermediate = return_intermediate
        # forward() always returns the stacked per-layer outputs.
        assert return_intermediate
        # hack implementation for iterative bounding box refinement and
        # two-stage Deformable DETR: these heads are assigned from outside.
        self.bbox_embed = None
        self.class_embed = None
        self.use_dab = use_dab
        self.d_model = d_model
        self.query_dim = query_dim
        if use_dab:
            self.query_scale = MLP(d_model, d_model, d_model, 2)
            self.ref_point_head = MLP(2 * d_model, d_model, d_model, 2)

    def forward(self, tgt, reference_points, src, src_spatial_shapes,
                src_level_start_index, src_valid_ratios,
                query_pos=None, src_padding_mask=None):
        """Decode ``tgt`` against the encoder memory ``src``.

        Returns:
            (stacked per-layer outputs, stacked reference points). The
            reference-point stack starts with the initial points; refined
            points are appended only when ``bbox_embed`` is set.
        """
        output = tgt
        if self.use_dab:
            # With DAB queries the positional part is derived per layer below.
            assert query_pos is None

        # The layers take sequence-first memory; transpose once, not per layer.
        memory = src.transpose(0, 1)

        intermediate = []
        intermediate_reference_points = [reference_points]
        for layer_id, layer in enumerate(self.layers):
            # Rescale the reference points by every level's valid ratio.
            if reference_points.shape[-1] == 4:
                reference_points_input = reference_points[:, :, None] \
                    * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None]  # bs, nq, 4, 4
            else:
                assert reference_points.shape[-1] == 2
                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]

            if self.use_dab:
                query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :])  # bs, nq, 256*2
                raw_query_pos = self.ref_point_head(query_sine_embed)  # bs, nq, 256
                pos_scale = self.query_scale(output) if layer_id != 0 else 1
                query_pos = pos_scale * raw_query_pos

            # The layer used to be called with seven positional arguments that
            # did not line up with its signature (reference points landed in
            # tgt_query_sine_embed, the memory in tgt_key_padding_mask, ...),
            # and with batch-first tensors. Pass everything by name and in the
            # sequence-first layout the layer expects, then go back to
            # batch-first for the rest of this method.
            output = layer(
                tgt=output.transpose(0, 1),
                tgt_query_pos=None if query_pos is None else query_pos.transpose(0, 1),
                tgt_reference_points=reference_points_input.transpose(0, 1),
                memory=memory,
                memory_key_padding_mask=src_padding_mask,
                memory_level_start_index=src_level_start_index,
                memory_spatial_shapes=src_spatial_shapes,
            ).transpose(0, 1)

            # hack implementation for iterative bounding box refinement
            if self.bbox_embed is not None:
                # DeformableTransformer.forward indexes bbox_embed per layer,
                # so accept a ModuleList of heads as well as a single head.
                if isinstance(self.bbox_embed, nn.ModuleList):
                    box_head = self.bbox_embed[layer_id]
                else:
                    box_head = self.bbox_embed
                box_holder = box_head(output)
                box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points)
                new_reference_points = box_holder[..., :self.query_dim].sigmoid()
                # Detach so the next layer's points carry no gradient.
                reference_points = new_reference_points.detach()
                if layer_id != self.num_layers - 1:
                    intermediate_reference_points.append(new_reference_points)

            intermediate.append(output)

        return torch.stack(intermediate), torch.stack(intermediate_reference_points)
572
+
573
+
574
+ def _get_clones(module, N):
575
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
576
+
577
+
578
def build_deforamble_transformer(args):
    """Create a ``DeformableTransformer`` from the attributes of ``args``.

    The activation is fixed to "relu" and intermediate decoder outputs are
    always requested; every other setting is read from ``args``.
    """
    settings = dict(
        d_model=args.hidden_dim,
        nhead=args.nheads,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        activation="relu",
        return_intermediate_dec=True,
        num_feature_levels=args.ddetr_num_feature_levels,
        dec_n_points=args.ddetr_dec_n_points,
        enc_n_points=args.ddetr_enc_n_points,
        two_stage=args.ddetr_two_stage,
        two_stage_num_proposals=args.num_queries,
        use_dab=args.ddetr_use_dab,
        high_dim_query_update=args.ddetr_high_dim_query_update,
        no_sine_embed=args.ddetr_no_sine_embed,
    )
    return DeformableTransformer(**settings)
python/utils/dependencies/XPose/models/UniPose/transformer_vanilla.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3
+ """
4
+ DETR Transformer class.
5
+
6
+ Copy-paste from torch.nn.Transformer with modifications:
7
+ * positional encodings are passed in MHattention
8
+ * extra LN at the end of encoder is removed
9
+ * decoder returns a stack of activations from all decoding layers
10
+ """
11
+ import torch
12
+ from torch import Tensor, nn
13
+ from typing import List, Optional
14
+
15
+ from .utils import _get_activation_fn, _get_clones
16
+
17
+
18
class TextTransformer(nn.Module):
    """Stack of ``num_layers`` ``TransformerEncoderLayer`` modules for text features."""

    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.nheads = nheads
        self.dim_feedforward = dim_feedforward
        # No final normalisation is configured here; forward() applies one only if set.
        self.norm = None

        prototype_layer = TransformerEncoderLayer(
            d_model=d_model,
            nhead=nheads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.layers = _get_clones(prototype_layer, num_layers)

    def forward(self, memory_text:torch.Tensor, text_attention_mask:torch.Tensor):
        """Run the text features through every encoder layer.

        Args:
            memory_text: bs, num_token, d_model
            text_attention_mask: bs, num_token; handed to each layer as
                ``src_key_padding_mask``.

        Returns:
            output: bs, num_token, d_model
        """
        # The layers work on (num_token, bs, d_model); swap the first two axes
        # on the way in and swap them back on the way out.
        hidden = memory_text.transpose(0, 1)

        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_key_padding_mask=text_attention_mask)

        if self.norm is not None:
            hidden = self.norm(hidden)

        return hidden.transpose(0, 1)
54
+
55
+
56
+
57
+
58
class TransformerEncoderLayer(nn.Module):
    """Transformer encoder layer: self-attention followed by a feed-forward block.

    Each sub-block is wrapped as ``x = norm(x + dropout(sublayer(x)))``.
    ``normalize_before`` is stored but ``forward`` does not consult it.

    Args:
        d_model: feature dimension of the input and output.
        nhead: number of attention heads.
        dim_feedforward: hidden width of the feed-forward block.
        dropout: dropout probability used in attention and both residual paths.
        activation: name resolved through ``_get_activation_fn``.
        normalize_before: stored on the instance only.
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before
        self.nhead = nhead

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        return tensor if pos is None else tensor + pos

    def forward(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Apply self-attention and the feed-forward block to ``src``.

        Args:
            src: input features; ``src.shape[1]`` is compared against the
                leading dimension of a 3-D ``src_mask``.
            src_mask: optional attention mask passed to the attention module
                as ``attn_mask``. May be omitted.
            src_key_padding_mask: accepted for interface compatibility but not
                forwarded to the attention module.
            pos: optional positional embedding added to the queries and keys
                (not to the values).

        Returns:
            Tensor with the same shape as ``src``.
        """
        # repeat attn mask
        # Guard against the default ``src_mask=None``: calling ``.dim()`` on
        # None raised AttributeError for every caller that supplied no mask.
        if src_mask is not None and src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:
            # bs, num_q, num_k
            # NOTE(review): ``repeat`` tiles the whole batch ``nhead`` times;
            # confirm this ordering matches the (bs * nhead) layout that
            # nn.MultiheadAttention expects when masks differ across the batch.
            src_mask = src_mask.repeat(self.nhead, 1, 1)

        q = k = self.with_pos_embed(src, pos)

        # Only ``attn_mask`` is supplied; the key-padding mask is deliberately
        # left out of this call.
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]

        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src
102
+
python/utils/dependencies/XPose/models/UniPose/unipose.py ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
7
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
8
+ # ------------------------------------------------------------------------
9
+ import os
10
+ import copy
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch import nn
14
+ from typing import List
15
+
16
+ from util.keypoint_ops import keypoint_xyzxyz_to_xyxyzz
17
+ from util.misc import NestedTensor, nested_tensor_from_tensor_list,inverse_sigmoid
18
+
19
+ from .utils import MLP
20
+ from .backbone import build_backbone
21
+ from ..registry import MODULE_BUILD_FUNCS
22
+ from .mask_generate import prepare_for_mask, post_process
23
+ from .deformable_transformer import build_deformable_transformer
24
+
25
+
26
+ class UniPose(nn.Module):
27
+ """ This is the Cross-Attention Detector module that performs object detection """
28
+
29
    def __init__(self, backbone, transformer, num_classes, num_queries,
                 aux_loss=False, iter_update=False,
                 query_dim=2,
                 random_refpoints_xy=False,
                 fix_refpoints_hw=-1,
                 num_feature_levels=1,
                 nheads=8,
                 # two stage
                 two_stage_type='no',  # ['no', 'standard']
                 two_stage_add_query_num=0,
                 dec_pred_class_embed_share=True,
                 dec_pred_bbox_embed_share=True,
                 two_stage_class_embed_share=True,
                 two_stage_bbox_embed_share=True,
                 decoder_sa_type='sa',
                 num_patterns=0,
                 dn_number=100,
                 dn_box_noise_scale=0.4,
                 dn_label_noise_ratio=0.5,
                 dn_labelbook_size=100,
                 use_label_enc=True,

                 text_encoder_type='bert-base-uncased',

                 binary_query_selection=False,
                 use_cdn=True,
                 sub_sentence_present=True,
                 num_body_points=68,
                 num_box_decoder_layers=2,
                 ):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
                         Conditional DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
            iter_update: must be truthy; asserted below.
            query_dim: must be 4; asserted below (note that the default of 2 fails that assert).

            fix_refpoints_hw: -1(default): learn w and h for each box separately
                                >0 : given fixed number
                                -2 : learn a shared w and h
            num_body_points: number of keypoints per instance; also copied onto the decoder.
            num_box_decoder_layers: number of leading decoder layers handled as box-only layers
                                    when sizing the pose heads.
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim = transformer.d_model
        self.num_feature_levels = num_feature_levels
        self.nheads = nheads
        self.use_label_enc = use_label_enc
        if use_label_enc:
            self.label_enc = nn.Embedding(dn_labelbook_size + 1, hidden_dim)
        else:
            # use_label_enc=False is not supported; the assignment after the raise is unreachable.
            raise NotImplementedError
            self.label_enc = None
        self.max_text_len = 256
        self.binary_query_selection = binary_query_selection
        self.sub_sentence_present = sub_sentence_present

        # setting query dim
        self.query_dim = query_dim
        assert query_dim == 4
        self.random_refpoints_xy = random_refpoints_xy
        self.fix_refpoints_hw = fix_refpoints_hw

        # for dn training
        self.num_patterns = num_patterns
        self.dn_number = dn_number
        self.dn_box_noise_scale = dn_box_noise_scale
        self.dn_label_noise_ratio = dn_label_noise_ratio
        self.dn_labelbook_size = dn_labelbook_size
        self.use_cdn = use_cdn

        # Two separate 3-layer MLPs mapping 512-dim text embeddings to hidden_dim:
        # one for instance prompts, one for keypoint prompts (see forward()).
        self.projection = MLP(512, hidden_dim, hidden_dim, 3)

        self.projection_kpt = MLP(512, hidden_dim, hidden_dim, 3)

        # `device` is only referenced by the commented-out CLIP loading code below.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # model, _ = clip.load("ViT-B/32", device=device)
        # self.clip_model = model
        # visual_parameters = list(self.clip_model.visual.parameters())
        # #
        # for param in visual_parameters:
        #     param.requires_grad = False

        self.pos_proj = nn.Linear(hidden_dim, 768)
        self.padding = nn.Embedding(1, 768)

        # prepare input projection layers
        if num_feature_levels > 1:
            num_backbone_outs = len(backbone.num_channels)
            input_proj_list = []
            # One 1x1 conv + GroupNorm per backbone output level.
            for _ in range(num_backbone_outs):
                in_channels = backbone.num_channels[_]
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
            # Extra levels beyond what the backbone provides: stride-2 3x3 convs.
            # The first one consumes the last backbone level's channel count
            # (in_channels left over from the loop above); later ones consume hidden_dim.
            for _ in range(num_feature_levels - num_backbone_outs):
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
                in_channels = hidden_dim
            self.input_proj = nn.ModuleList(input_proj_list)
        else:
            assert two_stage_type == 'no', "two_stage_type should be no if num_feature_levels=1 !!!"
            self.input_proj = nn.ModuleList([
                nn.Sequential(
                    nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                )])

        self.backbone = backbone
        self.aux_loss = aux_loss
        self.box_pred_damping = box_pred_damping = None

        self.iter_update = iter_update
        assert iter_update, "Why not iter_update?"

        # prepare pred layers
        self.dec_pred_class_embed_share = dec_pred_class_embed_share
        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share
        # prepare class & box embed
        _class_embed = ContrastiveAssign()

        # Box head: final layer zero-initialised.
        _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)

        # Keypoint heads: only _pose_embed gets the zero-initialised final layer;
        # _pose_hw_embed keeps its default initialisation.
        _pose_embed = MLP(hidden_dim, hidden_dim, 2, 3)
        _pose_hw_embed = MLP(hidden_dim, hidden_dim, 2, 3)
        nn.init.constant_(_pose_embed.layers[-1].weight.data, 0)
        nn.init.constant_(_pose_embed.layers[-1].bias.data, 0)

        # "share" means every decoder layer references the same module object;
        # otherwise each layer receives its own deep copy.
        if dec_pred_bbox_embed_share:
            box_embed_layerlist = [_bbox_embed for i in range(transformer.num_decoder_layers)]
        else:
            box_embed_layerlist = [copy.deepcopy(_bbox_embed) for i in range(transformer.num_decoder_layers)]
        if dec_pred_class_embed_share:
            class_embed_layerlist = [_class_embed for i in range(transformer.num_decoder_layers)]
        else:
            class_embed_layerlist = [copy.deepcopy(_class_embed) for i in range(transformer.num_decoder_layers)]

        # Sharing of the pose head is also controlled by dec_pred_bbox_embed_share.
        # The list has (num_decoder_layers - num_box_decoder_layers + 1) entries.
        if dec_pred_bbox_embed_share:

            pose_embed_layerlist = [_pose_embed for i in
                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]
        else:
            pose_embed_layerlist = [copy.deepcopy(_pose_embed) for i in
                                    range(transformer.num_decoder_layers - num_box_decoder_layers + 1)]

        # The pose h/w head is always shared, with one entry fewer than the pose head list.
        pose_hw_embed_layerlist = [_pose_hw_embed for i in
                                   range(transformer.num_decoder_layers - num_box_decoder_layers)]

        self.num_box_decoder_layers = num_box_decoder_layers
        self.bbox_embed = nn.ModuleList(box_embed_layerlist)
        self.class_embed = nn.ModuleList(class_embed_layerlist)
        self.num_body_points = num_body_points
        self.pose_embed = nn.ModuleList(pose_embed_layerlist)
        self.pose_hw_embed = nn.ModuleList(pose_hw_embed_layerlist)

        # Expose the same head modules on the decoder.
        self.transformer.decoder.bbox_embed = self.bbox_embed
        self.transformer.decoder.class_embed = self.class_embed

        self.transformer.decoder.pose_embed = self.pose_embed
        self.transformer.decoder.pose_hw_embed = self.pose_hw_embed

        self.transformer.decoder.num_body_points = num_body_points

        # two stage
        self.two_stage_type = two_stage_type
        self.two_stage_add_query_num = two_stage_add_query_num
        assert two_stage_type in ['no', 'standard'], "unknown param {} of two_stage_type".format(two_stage_type)
        if two_stage_type != 'no':
            # Encoder-output heads either reuse the decoder heads (which requires the
            # decoder heads themselves to be shared) or get their own deep copies.
            if two_stage_bbox_embed_share:
                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
                self.transformer.enc_out_bbox_embed = _bbox_embed
            else:
                self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)

            if two_stage_class_embed_share:
                assert dec_pred_class_embed_share and dec_pred_bbox_embed_share
                self.transformer.enc_out_class_embed = _class_embed
            else:
                self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed)

            # NOTE(review): the scraped source lost its indentation; these three lines are
            # assumed to belong to the two-stage branch — confirm against the upstream file.
            self.refpoint_embed = None
            if self.two_stage_add_query_num > 0:
                self.init_ref_points(two_stage_add_query_num)

        self.decoder_sa_type = decoder_sa_type
        assert decoder_sa_type in ['sa', 'ca_label', 'ca_content']
        # self.replace_sa_with_double_ca = replace_sa_with_double_ca
        if decoder_sa_type == 'ca_label':
            # Every decoder layer shares one label embedding table.
            self.label_embedding = nn.Embedding(num_classes, hidden_dim)
            for layer in self.transformer.decoder.layers:
                layer.label_embedding = self.label_embedding
        else:
            for layer in self.transformer.decoder.layers:
                layer.label_embedding = None
            self.label_embedding = None

        self._reset_parameters()
241
+
242
+ def open_set_transfer_init(self):
243
+ for name, param in self.named_parameters():
244
+ if 'fusion_layers' in name:
245
+ continue
246
+ if 'ca_text' in name:
247
+ continue
248
+ if 'catext_norm' in name:
249
+ continue
250
+ if 'catext_dropout' in name:
251
+ continue
252
+ if "text_layers" in name:
253
+ continue
254
+ if 'bert' in name:
255
+ continue
256
+ if 'bbox_embed' in name:
257
+ continue
258
+ if 'label_enc.weight' in name:
259
+ continue
260
+ if 'feat_map' in name:
261
+ continue
262
+ if 'enc_output' in name:
263
+ continue
264
+
265
+ param.requires_grad_(False)
266
+
267
+ # import ipdb; ipdb.set_trace()
268
+
269
+ def _reset_parameters(self):
270
+ # init input_proj
271
+ for proj in self.input_proj:
272
+ nn.init.xavier_uniform_(proj[0].weight, gain=1)
273
+ nn.init.constant_(proj[0].bias, 0)
274
+
275
+ def init_ref_points(self, use_num_queries):
276
+ self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)
277
+
278
+ if self.random_refpoints_xy:
279
+ # import ipdb; ipdb.set_trace()
280
+ self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
281
+ self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
282
+ self.refpoint_embed.weight.data[:, :2].requires_grad = False
283
+
284
+ if self.fix_refpoints_hw > 0:
285
+ print("fix_refpoints_hw: {}".format(self.fix_refpoints_hw))
286
+ assert self.random_refpoints_xy
287
+ self.refpoint_embed.weight.data[:, 2:] = self.fix_refpoints_hw
288
+ self.refpoint_embed.weight.data[:, 2:] = inverse_sigmoid(self.refpoint_embed.weight.data[:, 2:])
289
+ self.refpoint_embed.weight.data[:, 2:].requires_grad = False
290
+ elif int(self.fix_refpoints_hw) == -1:
291
+ pass
292
+ elif int(self.fix_refpoints_hw) == -2:
293
+ print('learn a shared h and w')
294
+ assert self.random_refpoints_xy
295
+ self.refpoint_embed = nn.Embedding(use_num_queries, 2)
296
+ self.refpoint_embed.weight.data[:, :2].uniform_(0, 1)
297
+ self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
298
+ self.refpoint_embed.weight.data[:, :2].requires_grad = False
299
+ self.hw_embed = nn.Embedding(1, 1)
300
+ else:
301
+ raise NotImplementedError('Unknown fix_refpoints_hw {}'.format(self.fix_refpoints_hw))
302
+
303
    def forward(self, samples: NestedTensor, targets: List = None, **kw):
        """ Run detection and keypoint prediction for a batch.

            Inputs:
               - samples: batched images. Lists and plain tensors are converted with
                 nested_tensor_from_tensor_list before the backbone is called.
                 NOTE(review): samples.shape and samples.device are read *before* that
                 conversion, so this presumably expects a Tensor (or an object exposing
                 both attributes) — confirm against callers.
               - targets: one dict per image. The keys read here are
                 'instance_text_prompt', 'object_embeddings_text',
                 'kpts_embeddings_text' and 'kpt_vis_text'. Although the default is
                 None, targets is iterated unconditionally, so it must be provided.

            It returns a dict with the following elements (taken from the last decoder layer):
               - "pred_logits": the classification logits for all queries.
               - "pred_boxes": the box coordinates for all queries, after a sigmoid.
               - "pred_keypoints": the keypoint predictions, after a sigmoid and
                 keypoint_xyzxyz_to_xyxyzz.
        """

        # ---- text prompts ---------------------------------------------------------
        captions = [t['instance_text_prompt'] for t in targets]
        bs=len(captions)
        tensor_list = [tgt["object_embeddings_text"] for tgt in targets]
        max_size = 350
        # Zero-pad each per-image embedding along dim 0 up to max_size rows so they can be
        # stacked. Tensors with max_size or more rows are passed through unchanged.
        padded_tensors = [torch.cat([tensor, torch.zeros(max_size - tensor.size(0), tensor.size(1),device=tensor.device)]) if tensor.size(0) < max_size else tensor for tensor in tensor_list]
        object_embeddings_text = torch.stack(padded_tensors)

        kpts_embeddings_text = torch.stack([tgt["kpts_embeddings_text"] for tgt in targets])[:, :self.num_body_points]
        encoded_text=self.projection(object_embeddings_text) # bs, 81, 101, 256
        kpt_embeddings_specific=self.projection_kpt(kpts_embeddings_text) # bs, 81, 101, 256

        # Prepend a column of ones to the keypoint visibility along the last dim.
        kpt_vis = torch.stack([tgt["kpt_vis_text"] for tgt in targets])[:, :self.num_body_points]
        kpt_mask = torch.cat((torch.ones_like(kpt_vis, device=kpt_vis.device)[..., 0].unsqueeze(-1), kpt_vis), dim=-1)

        num_classes = encoded_text.shape[1] # bs, 81, 101, 256
        # Identity attention mask per image.
        text_self_attention_masks = torch.eye(num_classes).unsqueeze(0).expand(bs, -1, -1).bool().to(samples.device)
        # All-False boolean mask; the first len(captions[i]) slots of row i are then enabled.
        text_token_mask = torch.zeros(samples.shape[0],num_classes).to(samples.device)>0
        for i in range(bs):
            text_token_mask[i,:len(captions[i])]=True

        position_ids = torch.zeros(samples.shape[0], num_classes).to(samples.device)

        for i in range(bs):
            position_ids[i,:len(captions[i])]= 1

        text_dict = {
            'encoded_text': encoded_text,  # bs, 195, d_model
            'text_token_mask': text_token_mask,  # bs, 195
            'position_ids': position_ids,  # bs, 195
            'text_self_attention_masks': text_self_attention_masks  # bs, 195,195
        }

        # import ipdb; ipdb.set_trace()

        # ---- image features -------------------------------------------------------
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        features, poss = self.backbone(samples)
        # Debugger hook, active only when this environment variable equals '1'.
        if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
            import ipdb;
            ipdb.set_trace()

        # Project every backbone level to hidden_dim.
        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        # Synthesize additional levels when more are configured than the backbone returned:
        # the first extra level is computed from the last raw backbone feature, later ones
        # from the previously projected level.
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = samples.mask
                # Resize the padding mask to the new feature map's spatial size.
                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                poss.append(pos_l)

        if self.label_enc is not None:
            label_enc = self.label_enc
        else:
            # Unsupported configuration; the assignment after the raise is unreachable.
            raise NotImplementedError
            label_enc = encoded_text
        if self.dn_number > 0 or targets is not None:
            input_query_label, input_query_bbox, attn_mask, attn_mask2, dn_meta = \
                prepare_for_mask(kpt_mask=kpt_mask)
        else:
            assert targets is None
            input_query_bbox = input_query_label = attn_mask = attn_mask2 = dn_meta = None

        # ---- transformer ----------------------------------------------------------
        hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(srcs, masks, input_query_bbox, poss,
                                                                             input_query_label, attn_mask, attn_mask2,
                                                                             text_dict, dn_meta,targets,kpt_embeddings_specific)

        # In case num object=0
        # The additions below contribute exact zeros but reference these parameters —
        # presumably so they always take part in the autograd graph; confirm.
        if self.label_enc is not None:
            hs[0] += self.label_enc.weight[0, 0] * 0.0

        hs[0] += self.pos_proj.weight[0, 0] * 0.0
        hs[0] += self.pos_proj.bias[0] * 0.0
        hs[0] += self.padding.weight[0, 0] * 0.0

        # ---- boxes and class logits per decoder layer -------------------------------
        num_group = 50
        # dn_meta is indexed only in training mode.
        effective_dn_number = dn_meta['pad_size'] if self.training else 0
        outputs_coord_list = []
        outputs_class = []

        for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_cls_embed, layer_hs) in enumerate(
                zip(reference[:-1], self.bbox_embed, self.class_embed, hs)):

            if dec_lid < self.num_box_decoder_layers:
                # Box-only layers: every query is refined as a box.
                layer_delta_unsig = layer_bbox_embed(layer_hs)
                layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig)
                layer_outputs_unsig = layer_outputs_unsig.sigmoid()
                layer_cls = layer_cls_embed(layer_hs, text_dict)
                outputs_coord_list.append(layer_outputs_unsig)
                outputs_class.append(layer_cls)

            else:
                # Later layers: after the first effective_dn_number queries, the sequence is
                # read in groups of (num_body_points + 1); the first slot of each group
                # (stride-sliced with 0::(num_body_points + 1)) is used as the box query.
                layer_hs_bbox_dn = layer_hs[:, :effective_dn_number, :]
                layer_hs_bbox_norm = layer_hs[:, effective_dn_number:, :][:, 0::(self.num_body_points + 1), :]
                bs = layer_ref_sig.shape[0]
                reference_before_sigmoid_bbox_dn = layer_ref_sig[:, :effective_dn_number, :]
                reference_before_sigmoid_bbox_norm = layer_ref_sig[:, effective_dn_number:, :][:,
                                                     0::(self.num_body_points + 1), :]
                layer_delta_unsig_dn = layer_bbox_embed(layer_hs_bbox_dn)
                layer_delta_unsig_norm = layer_bbox_embed(layer_hs_bbox_norm)
                layer_outputs_unsig_dn = layer_delta_unsig_dn + inverse_sigmoid(reference_before_sigmoid_bbox_dn)
                layer_outputs_unsig_dn = layer_outputs_unsig_dn.sigmoid()
                layer_outputs_unsig_norm = layer_delta_unsig_norm + inverse_sigmoid(reference_before_sigmoid_bbox_norm)
                layer_outputs_unsig_norm = layer_outputs_unsig_norm.sigmoid()
                layer_outputs_unsig = torch.cat((layer_outputs_unsig_dn, layer_outputs_unsig_norm), dim=1)
                layer_cls_dn = layer_cls_embed(layer_hs_bbox_dn, text_dict)
                layer_cls_norm = layer_cls_embed(layer_hs_bbox_norm, text_dict)
                layer_cls = torch.cat((layer_cls_dn, layer_cls_norm), dim=1)
                outputs_class.append(layer_cls)
                outputs_coord_list.append(layer_outputs_unsig)

        # update keypoints
        outputs_keypoints_list = []
        outputs_keypoints_hw = []
        # Positions of the keypoint slots: every index that is not the first slot of a group.
        kpt_index = [x for x in range(num_group * (self.num_body_points + 1)) if x % (self.num_body_points + 1) != 0]
        for dec_lid, (layer_ref_sig, layer_hs) in enumerate(zip(reference[:-1], hs)):
            if dec_lid < self.num_box_decoder_layers:
                # Box-only layers contribute an all-zero keypoint placeholder.
                assert isinstance(layer_hs, torch.Tensor)
                bs = layer_hs.shape[0]
                layer_res = layer_hs.new_zeros((bs, self.num_queries, self.num_body_points * 3))
                outputs_keypoints_list.append(layer_res)
            else:
                bs = layer_ref_sig.shape[0]
                layer_hs_kpt = layer_hs[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
                                                                                                 device=layer_hs.device))
                delta_xy_unsig = self.pose_embed[dec_lid - self.num_box_decoder_layers](layer_hs_kpt)
                layer_ref_sig_kpt = layer_ref_sig[:, effective_dn_number:, :].index_select(1, torch.tensor(kpt_index,
                                                                                                           device=layer_hs.device))
                layer_outputs_unsig_keypoints = delta_xy_unsig + inverse_sigmoid(layer_ref_sig_kpt[..., :2])
                # The third channel is a constant 1 before the sigmoid, i.e. not predicted here.
                vis_xy_unsig = torch.ones_like(layer_outputs_unsig_keypoints,
                                               device=layer_outputs_unsig_keypoints.device)
                xyv = torch.cat((layer_outputs_unsig_keypoints, vis_xy_unsig[:, :, 0].unsqueeze(-1)), dim=-1)
                xyv = xyv.sigmoid()
                layer_res = xyv.reshape((bs, num_group, self.num_body_points, 3)).flatten(2, 3)
                layer_hw = layer_ref_sig_kpt[..., 2:].reshape(bs, num_group, self.num_body_points, 2).flatten(2, 3)
                layer_res = keypoint_xyzxyz_to_xyxyzz(layer_res)
                outputs_keypoints_list.append(layer_res)
                outputs_keypoints_hw.append(layer_hw)

        # NOTE(review): self._set_aux_loss is not defined in the class body shown in this
        # file; this branch presumably relies on dn_meta being None or dn_number == 0 — confirm.
        if self.dn_number > 0 and dn_meta is not None:
            outputs_class, outputs_coord_list = \
                post_process(outputs_class, outputs_coord_list,
                             dn_meta, self.aux_loss, self._set_aux_loss)
        # Only the last decoder layer's predictions are returned.
        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord_list[-1],
               'pred_keypoints': outputs_keypoints_list[-1]}

        return out
489
+
490
+
491
@MODULE_BUILD_FUNCS.registe_with_name(module_name='UniPose')
def build_unipose(args):
    """Build a ``UniPose`` model from a configuration object.

    Args:
        args: configuration object. Required attributes: ``num_classes``,
            ``device``, ``num_queries``, ``random_refpoints_xy``,
            ``fix_refpoints_hw``, ``num_feature_levels``, ``nheads``,
            ``two_stage_type``, ``two_stage_bbox_embed_share``,
            ``two_stage_class_embed_share``, ``decoder_sa_type``,
            ``num_patterns``, ``use_dn``, ``dn_number``,
            ``dn_box_noise_scale``, ``dn_label_noise_ratio``,
            ``use_label_enc`` and ``text_encoder_type``, plus whatever
            ``build_backbone`` and ``build_deformable_transformer`` read.
            Optional attributes (default in parentheses):
            ``dec_pred_class_embed_share`` (True),
            ``dec_pred_bbox_embed_share`` (True),
            ``binary_query_selection`` (False), ``use_cdn`` (True),
            ``sub_sentence_present`` (True), ``dn_labelbook_size``
            (``num_classes``, see the comment in the body).

    Returns:
        The constructed ``UniPose`` module. ``aux_loss`` and ``iter_update``
        are always True, ``query_dim`` is always 4, and ``num_body_points`` /
        ``num_box_decoder_layers`` keep the ``UniPose`` defaults.

    Raises:
        AttributeError: if a required attribute is missing from ``args``.
    """
    num_classes = args.num_classes
    # The device object is not used below; the call is kept so that a missing
    # or malformed ``args.device`` still fails here rather than later.
    torch.device(args.device)

    backbone = build_backbone(args)

    transformer = build_deformable_transformer(args)

    # Optional settings are read with hasattr/getattr instead of bare
    # ``except:`` blocks, which also swallowed KeyboardInterrupt/SystemExit
    # and any unrelated error raised while reading the configuration.
    #
    # ``dn_labelbook_size`` historically shared one try-block with
    # ``match_unstable_error``, so it falls back to ``num_classes`` when
    # *either* attribute is absent. That coupling is preserved because the
    # value determines the size of the label embedding table.
    if hasattr(args, 'match_unstable_error') and hasattr(args, 'dn_labelbook_size'):
        dn_labelbook_size = args.dn_labelbook_size
    else:
        dn_labelbook_size = num_classes

    dec_pred_class_embed_share = getattr(args, 'dec_pred_class_embed_share', True)
    dec_pred_bbox_embed_share = getattr(args, 'dec_pred_bbox_embed_share', True)
    binary_query_selection = getattr(args, 'binary_query_selection', False)
    use_cdn = getattr(args, 'use_cdn', True)
    sub_sentence_present = getattr(args, 'sub_sentence_present', True)

    model = UniPose(
        backbone,
        transformer,
        num_classes=num_classes,
        num_queries=args.num_queries,
        aux_loss=True,
        iter_update=True,
        query_dim=4,
        random_refpoints_xy=args.random_refpoints_xy,
        fix_refpoints_hw=args.fix_refpoints_hw,
        num_feature_levels=args.num_feature_levels,
        nheads=args.nheads,
        dec_pred_class_embed_share=dec_pred_class_embed_share,
        dec_pred_bbox_embed_share=dec_pred_bbox_embed_share,
        # two stage
        two_stage_type=args.two_stage_type,
        # box_share
        two_stage_bbox_embed_share=args.two_stage_bbox_embed_share,
        two_stage_class_embed_share=args.two_stage_class_embed_share,
        decoder_sa_type=args.decoder_sa_type,
        num_patterns=args.num_patterns,
        # Denoising queries are disabled entirely when ``use_dn`` is falsy.
        dn_number=args.dn_number if args.use_dn else 0,
        dn_box_noise_scale=args.dn_box_noise_scale,
        dn_label_noise_ratio=args.dn_label_noise_ratio,
        dn_labelbook_size=dn_labelbook_size,
        use_label_enc=args.use_label_enc,

        text_encoder_type=args.text_encoder_type,

        binary_query_selection=binary_query_selection,
        use_cdn=use_cdn,
        sub_sentence_present=sub_sentence_present
    )

    return model
571
+
572
+
573
class ContrastiveAssign(nn.Module):
    """Score queries against text embeddings with a dot product.

    The module has no parameters of its own; ``project``, ``cal_bias`` and
    ``max_text_len`` are only stored on the instance. A non-None ``cal_bias``
    is not supported and makes ``forward`` raise.
    """

    def __init__(self, project=False, cal_bias=None, max_text_len=256):
        super().__init__()
        self.project = project
        self.cal_bias = cal_bias
        self.max_text_len = max_text_len

    def forward(self, x, text_dict):
        """Return query-to-text similarity logits.

        Args:
            x: query features; its last dimension must match the last
                dimension of ``text_dict['encoded_text']``.
            text_dict: dict with at least
                'encoded_text': text embeddings, (bs, num_text, d_model)
                'text_token_mask': (bs, num_text); True for used tokens,
                    False for padding tokens.

        Returns:
            Tensor of shape ``(*x.shape[:-1], num_text)`` in which the columns
            of masked-out tokens are ``-inf``. It is allocated with
            ``torch.full``'s default dtype on the device of the logits.

        Raises:
            NotImplementedError: if ``self.cal_bias`` is not None.
        """
        assert isinstance(text_dict, dict)

        text_embed = text_dict['encoded_text']
        # The output width follows the text tensor, not ``self.max_text_len``.
        num_text = text_embed.shape[1]
        valid_tokens = text_dict['text_token_mask']

        if self.cal_bias is not None:
            raise NotImplementedError

        logits = torch.matmul(x, text_embed.transpose(-1, -2))
        logits.masked_fill_(~valid_tokens[:, None, :], float('-inf'))

        # padding to max_text_len
        padded = torch.full((*logits.shape[:-1], num_text), float('-inf'), device=logits.device)
        padded[..., :logits.shape[-1]] = logits

        return padded
python/utils/dependencies/XPose/models/UniPose/utils.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+
7
+ import copy
8
+ import torch
9
+ import random
10
+ from torch import nn, Tensor
11
+ import os
12
+ import numpy as np
13
+ import math
14
+ import torch.nn.functional as F
15
+ from torch import nn
16
+
17
+
18
+ def _get_clones(module, N, layer_share=False):
19
+ # import ipdb; ipdb.set_trace()
20
+ if layer_share:
21
+ return nn.ModuleList([module for i in range(N)])
22
+ else:
23
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
24
+
25
+
26
def get_sine_pos_embed(
    pos_tensor: torch.Tensor,
    num_pos_feats: int = 128,
    temperature: int = 10000,
    exchange_xy: bool = True,
):
    """Generate a sine position embedding from a position tensor.

    Args:
        pos_tensor (torch.Tensor): shape: [..., n]. Any number of leading
            dimensions is accepted.
        num_pos_feats (int): projected size for each float in the tensor.
            Must be even, because sin and cos halves are interleaved.
        temperature (int): temperature in the sine/cosine function.
        exchange_xy (bool, optional): exchange pos x and pos y.
            For example, if the input tensor is [x, y], the result is
            [pos(y), pos(x)]. Requires n >= 2. Defaults to True.

    Returns:
        pos_embed (torch.Tensor): shape: [..., n * num_pos_feats].
    """
    scale = 2 * math.pi
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)

    def sine_func(x: torch.Tensor):
        # x is [..., 1]; broadcasting against dim_t yields [..., num_pos_feats].
        sin_x = x * scale / dim_t
        # Interleave sin (even slots) and cos (odd slots). Stacking on a new
        # trailing axis and merging the last two axes works for every rank;
        # for 3-D input it equals the former ``stack(dim=3).flatten(2)``.
        sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=-1).flatten(-2)
        return sin_x

    pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)]
    if exchange_xy:
        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
    pos_res = torch.cat(pos_res, dim=-1)
    return pos_res
56
+
57
+
58
def gen_encoder_output_proposals(memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None):
    """Build per-location box proposals (in logit space) from encoder memory.

    Input:
        - memory: bs, sum(hw), d_model
        - memory_padding_mask: bs, sum(hw)  (True marks padded locations)
        - spatial_shapes: nlevel, 2  (height, width of each level)
        - learnedwh: 2  (optional; its sigmoid replaces the fixed 0.05 box size)
    Output:
        - output_memory: bs, sum(hw), d_model  (zeroed where the proposal is
          padded or out of range)
        - output_proposals: bs, sum(hw), 4  (inverse-sigmoid of cx, cy, w, h;
          ``inf`` where padded or out of range)
    """
    # Unpacking also enforces that memory is 3-D.
    N_, _, _ = memory.shape
    level_proposals = []
    offset = 0
    for lvl, (H_, W_) in enumerate(spatial_shapes):
        level_mask = memory_padding_mask[:, offset:(offset + H_ * W_)].view(N_, H_, W_, 1)
        # Count the unpadded rows / columns along the first column / row.
        valid_H = torch.sum(~level_mask[:, :, 0, 0], 1)
        valid_W = torch.sum(~level_mask[:, 0, :, 0], 1)

        ys = torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device)
        xs = torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device)
        grid_y, grid_x = torch.meshgrid(ys, xs)
        grid = torch.stack([grid_x, grid_y], -1)  # H_, W_, 2

        # Normalise cell centres by the valid extent of each sample.
        scale = torch.stack([valid_W, valid_H], 1).view(N_, 1, 1, 2)
        grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale

        # Box size doubles with every pyramid level.
        wh = torch.ones_like(grid)
        if learnedwh is not None:
            wh = wh * learnedwh.sigmoid() * (2.0 ** lvl)
        else:
            wh = wh * 0.05 * (2.0 ** lvl)

        level_proposals.append(torch.cat((grid, wh), -1).view(N_, -1, 4))
        offset += (H_ * W_)

    output_proposals = torch.cat(level_proposals, 1)
    output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
    # Locations that are padded or whose box falls outside (0.01, 0.99).
    rejected = memory_padding_mask.unsqueeze(-1) | ~output_proposals_valid

    output_proposals = torch.log(output_proposals / (1 - output_proposals))  # unsigmoid
    output_proposals = output_proposals.masked_fill(rejected, float('inf'))

    output_memory = memory.masked_fill(rejected, float(0))

    return output_memory, output_proposals
114
+
115
+
116
class RandomBoxPerturber():
    """Callable that jitters reference anchors by a random multiplicative factor."""

    def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None:
        """Keep the per-coordinate noise scales in (x, y, w, h) order."""
        scales = [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]
        self.noise_scale = torch.Tensor(scales)

    def __call__(self, refanchors: Tensor) -> Tensor:
        """Return ``refanchors * (1 + (u - 0.5) * scale)`` clamped to [0, 1].

        ``u`` is uniform noise of the same shape as ``refanchors``, which must
        be 3-D; only the first ``query_dim`` scales are used.
        """
        _, _, query_dim = refanchors.shape
        scale = self.noise_scale.to(refanchors.device)[:query_dim]

        uniform = torch.rand_like(refanchors)
        factor = 1 + (uniform - 0.5) * scale

        perturbed = refanchors * factor
        perturbed.clamp_(0, 1)
        return perturbed
129
+
130
+
131
def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions (logits) for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        num_boxes: Divisor applied to the reduced loss.
        alpha: (optional) Weighting factor in range (0,1) to balance
                positive vs negative examples. Default = 0.25; a negative
                value disables the weighting.
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
        no_reduction: If True, return the element-wise loss unchanged.
    Returns:
        Loss tensor: element-wise when ``no_reduction`` is set, otherwise the
        mean over dim 1, summed and divided by ``num_boxes``.
    """
    probs = inputs.sigmoid()
    bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    # Probability the model assigns to the true class of each element.
    p_t = probs * targets + (1 - probs) * (1 - targets)
    focal = bce * ((1 - p_t) ** gamma)

    if alpha >= 0:
        focal = (alpha * targets + (1 - alpha) * (1 - targets)) * focal

    return focal if no_reduction else focal.mean(1).sum() / num_boxes
160
+
161
+
162
class MLP(nn.Module):
    """Simple multi-layer perceptron (FFN): Linear layers with ReLU in between."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """Create the ``Linear`` stack: input -> hidden ... hidden -> output."""
        super().__init__()
        self.num_layers = num_layers
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )

    def forward(self, x):
        """Apply every layer in order; ReLU follows all but the final layer."""
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:
                x = F.relu(x)
        return x
175
+
176
+
177
+ def _get_activation_fn(activation, d_model=256, batch_dim=0):
178
+ """Return an activation function given a string"""
179
+ if activation == "relu":
180
+ return F.relu
181
+ if activation == "gelu":
182
+ return F.gelu
183
+ if activation == "glu":
184
+ return F.glu
185
+ if activation == "prelu":
186
+ return nn.PReLU()
187
+ if activation == "selu":
188
+ return F.selu
189
+
190
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
191
+
192
+
193
def gen_sineembed_for_position(pos_tensor):
    """Sine/cosine embedding of 2-D points or 4-D boxes, 128 features per coordinate.

    ``pos_tensor`` is indexed as ``[:, :, k]``, so it must be 3-D with a last
    dimension of 2 (x, y) or 4 (x, y, w, h). The result concatenates the
    per-coordinate embeddings along dim 2 in the order y, x[, w, h].

    Raises:
        ValueError: If the last dimension is neither 2 nor 4.
    """
    two_pi = 2 * math.pi
    freqs = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
    freqs = 10000 ** (2 * (freqs // 2) / 128)

    def _embed(channel):
        # Scale one coordinate, divide by the frequency ladder, then
        # interleave sin (even slots) with cos (odd slots).
        angles = (pos_tensor[:, :, channel] * two_pi)[:, :, None] / freqs
        return torch.stack((angles[:, :, 0::2].sin(), angles[:, :, 1::2].cos()), dim=3).flatten(2)

    # x and y are embedded before the width check, as in the original flow.
    pos_x = _embed(0)
    pos_y = _embed(1)

    last_dim = pos_tensor.size(-1)
    if last_dim == 2:
        return torch.cat((pos_y, pos_x), dim=2)
    if last_dim == 4:
        pos_w = _embed(2)
        pos_h = _embed(3)
        return torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
    raise ValueError("Unknown pos_tensor shape(-1):{}".format(last_dim))
220
+
221
+
222
def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):
    """Compute an OKS-style similarity between predicted and target poses.

    ``kpt_preds`` / ``kpt_gts`` are reshaped to (-1, K, 2), i.e. flattened
    (x, y) pairs. For each keypoint the squared distance is divided by
    ``2 * area * (2 * sigma) ** 2``, passed through ``exp(-d)``, multiplied
    by ``kpt_valids`` and averaged over the valid keypoints (with a 1e-6
    guard in the denominator).
    """
    sigmas = kpt_preds.new_tensor(sigmas)
    variances = (sigmas * 2) ** 2

    assert kpt_preds.size(0) == kpt_gts.size(0)
    pred_xy = kpt_preds.reshape(-1, kpt_preds.size(-1) // 2, 2)
    gt_xy = kpt_gts.reshape(-1, kpt_gts.size(-1) // 2, 2)

    dx = pred_xy[:, :, 0] - gt_xy[:, :, 0]
    dy = pred_xy[:, :, 1] - gt_xy[:, :, 1]
    sq_dist = dx ** 2 + dy ** 2

    exponent = sq_dist / (kpt_areas[:, None] * variances[None, :] * 2)
    per_keypoint = torch.exp(-exponent) * kpt_valids

    return per_keypoint.sum(dim=1) / (kpt_valids.sum(dim=1) + 1e-6)
241
+
242
+
243
def oks_loss(pred,
             target,
             valid=None,
             area=None,
             linear=False,
             sigmas=None,
             eps=1e-6):
    """Oks loss.
    Computing the oks loss between a set of predicted poses and target poses.
    The loss is the negative log of oks, or ``1 - oks`` when ``linear`` is set.
    Args:
        pred (torch.Tensor): Predicted poses of format (x1, y1, x2, y2, ...),
            shape (n, 2K).
        target (torch.Tensor): Corresponding gt poses, shape (n, 2K).
        valid: Passed to ``oks_overlaps`` as the keypoint validity weights.
        area: Passed to ``oks_overlaps`` as the per-pose area.
        linear (bool, optional): If True, use linear scale of loss instead of
            log scale. Default: False.
        sigmas: Passed to ``oks_overlaps`` as the per-keypoint sigmas.
        eps (float): Lower clamp applied to oks to avoid log(0).
    Return:
        torch.Tensor: Loss tensor.
    """
    similarity = oks_overlaps(pred, target, valid, area, sigmas).clamp(min=eps)
    return 1 - similarity if linear else -similarity.log()
269
+
270
+
271
class OKSLoss(nn.Module):
    """OKSLoss.
    Computing the oks loss between a set of predicted poses and target poses.
    Args:
        linear (bool): If True, use linear scale of loss instead of log scale.
            Default: False.
        num_keypoints (int): Only 68 is supported; every other value,
            including the default 17, raises ``ValueError``.
        eps (float): Eps to avoid log(0).
        reduction (str): Options are "none", "mean" and "sum".
        loss_weight (float): Weight of loss.
    """

    def __init__(self,
                 linear=False,
                 num_keypoints=17,
                 eps=1e-6,
                 reduction='mean',
                 loss_weight=1.0):
        super(OKSLoss, self).__init__()
        self.linear = linear
        self.eps = eps
        self.reduction = reduction
        self.loss_weight = loss_weight
        if num_keypoints != 68:
            raise ValueError(f'Unsupported keypoints number {num_keypoints}')
        # 17 distinct leading sigmas followed by 51 entries of .25,
        # all scaled by 1/10.
        leading = [
            .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
            1.07, .87, .87, .89, .89,
        ]
        self.sigmas = np.array(leading + [.25] * 51, dtype=np.float32) / 10.0

    def forward(self,
                pred,
                target,
                valid,
                area,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Apart from the all-zero-weight shortcut, the returned value is
        ``loss_weight * oks_loss(...)`` as produced per pose: ``weight``,
        ``avg_factor`` and the reduction mode are not applied to it here.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): The learning target of the prediction.
            valid (torch.Tensor): The visible flag of the target pose.
            area (torch.Tensor): The area of the target pose.
            weight (torch.Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.
            reduction_override (str, optional): The reduction method used to
                override the original reduction method of the loss.
                Defaults to None. Options are "none", "mean" and "sum".
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override or self.reduction

        has_weight = weight is not None
        if has_weight and reduction != 'none' and not torch.any(weight > 0):
            # No positive weight at all: return a zero that is still
            # connected to ``pred``.
            if pred.dim() == weight.dim() + 1:
                weight = weight.unsqueeze(1)
            return (pred * weight).sum()  # 0

        if has_weight and weight.dim() > 1:
            # TODO: remove this in the future
            # collapse a per-coordinate weight to one value per row
            assert weight.shape == pred.shape
            weight = weight.mean(-1)

        raw = oks_loss(
            pred,
            target,
            valid=valid,
            area=area,
            linear=self.linear,
            sigmas=self.sigmas,
            eps=self.eps)
        return self.loss_weight * raw
python/utils/dependencies/XPose/models/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------
2
+ # ED-Pose
3
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
7
+ from .UniPose.unipose import build_unipose
8
+
9
def build_model(args):
    """Look up the build function registered under ``args.modelname`` and call it.

    The registry is imported inside the function (original note: a register
    is used to maintain models from catdet6 on).
    """
    from .registry import MODULE_BUILD_FUNCS

    model_name = args.modelname
    assert model_name in MODULE_BUILD_FUNCS._module_dict
    builder = MODULE_BUILD_FUNCS.get(model_name)
    return builder(args)
python/utils/dependencies/XPose/models/registry.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # @Author: Yihao Chen
3
+ # @Date: 2021-08-16 16:03:17
4
+ # @Last Modified by: Shilong Liu
5
+ # @Last Modified time: 2022-01-23 15:26
6
+ # modified from mmcv
7
+
8
+ import inspect
9
+ from functools import partial
10
+
11
+
12
class Registry(object):
    """Name -> build-function mapping (modified from mmcv)."""

    def __init__(self, name):
        self._name = name
        self._module_dict = dict()

    def __repr__(self):
        registered = list(self._module_dict.keys())
        return self.__class__.__name__ + '(name={}, items={})'.format(
            self._name, registered)

    def __len__(self):
        return len(self._module_dict)

    @property
    def name(self):
        """Name given to this registry at construction."""
        return self._name

    @property
    def module_dict(self):
        """The underlying dict of registered build functions (not a copy)."""
        return self._module_dict

    def get(self, key):
        """Return the build function stored under ``key``, or None."""
        return self._module_dict.get(key, None)

    def registe_with_name(self, module_name=None, force=False):
        """Return ``register`` with ``module_name``/``force`` pre-bound, usable as a decorator."""
        return partial(self.register, module_name=module_name, force=force)

    def register(self, module_build_function, module_name=None, force=False):
        """Register a module build function.
        Args:
            module_build_function: Plain function that builds a module.
            module_name: Key to store it under; defaults to the function's
                ``__name__``.
            force: Allow overwriting an existing entry.
        Returns:
            The function itself, so this can be used as a decorator.
        Raises:
            TypeError: If ``module_build_function`` is not a function.
            KeyError: If the key is taken and ``force`` is false.
        """
        if not inspect.isfunction(module_build_function):
            raise TypeError('module_build_function must be a function, but got {}'.format(
                type(module_build_function)))

        key = module_build_function.__name__ if module_name is None else module_name
        if not force and key in self._module_dict:
            raise KeyError('{} is already registered in {}'.format(
                key, self.name))

        self._module_dict[key] = module_build_function
        return module_build_function


MODULE_BUILD_FUNCS = Registry('model build functions')
58
+
python/utils/dependencies/XPose/predefined_keypoints.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Keypoint definition for the "person" category: 17 keypoint names plus a
# "skeleton" list of keypoint-index pairs. The pairs use indices 1..17, so
# they appear to be 1-based positions into "keypoints" (confirm with the
# consumer before indexing).
person = {"keypoints":['nose', 'left eye', 'right eye', 'left ear', 'right ear', 'left shoulder', 'right shoulder', 'left elbow', 'right elbow', 'left wrist', 'right wrist', 'left hip', 'right hip', 'left knee', 'right knee', 'left ankle', 'right ankle'],"skeleton": [[16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13],[6,7],[6,8],[7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]]}
2
+
3
+ face = {"keypoints": ['right cheekbone 1', 'right cheekbone 2', 'right cheek 1', 'right cheek 2', 'right cheek 3', 'right cheek 4', 'right cheek 5', 'right chin', 'chin center', 'left chin', 'left cheek 5', 'left cheek 4', 'left cheek 3', 'left cheek 2', 'left cheek 1', 'left cheekbone 2', 'left cheekbone 1', 'right eyebrow 1', 'right eyebrow 2', 'right eyebrow 3', 'right eyebrow 4', 'right eyebrow 5', 'left eyebrow 1', 'left eyebrow 2', 'left eyebrow 3', 'left eyebrow 4', 'left eyebrow 5', 'nasal bridge 1', 'nasal bridge 2', 'nasal bridge 3', 'nasal bridge 4', 'right nasal wing 1', 'right nasal wing 2', 'nasal wing center', 'left nasal wing 1', 'left nasal wing 2', 'right eye eye corner 1', 'right eye upper eyelid 1', 'right eye upper eyelid 2', 'right eye eye corner 2', 'right eye lower eyelid 2', 'right eye lower eyelid 1', 'left eye eye corner 1', 'left eye upper eyelid 1', 'left eye upper eyelid 2', 'left eye eye corner 2', 'left eye lower eyelid 2', 'left eye lower eyelid 1', 'right mouth corner', 'upper lip outer edge 1', 'upper lip outer edge 2', 'upper lip outer edge 3', 'upper lip outer edge 4', 'upper lip outer edge 5', 'left mouth corner', 'lower lip outer edge 5', 'lower lip outer edge 4', 'lower lip outer edge 3', 'lower lip outer edge 2', 'lower lip outer edge 1', 'upper lip inter edge 1', 'upper lip inter edge 2', 'upper lip inter edge 3', 'upper lip inter edge 4', 'upper lip inter edge 5', 'lower lip inter edge 3', 'lower lip inter edge 2', 'lower lip inter edge 1'], "skeleton": []}
4
+
5
+ hand = {"keypoints":['wrist', 'thumb root', "thumb's third knuckle", "thumb's second knuckle", 'thumb’s first knuckle', "forefinger's root", "forefinger's third knuckle", "forefinger's second knuckle", "forefinger's first knuckle", "middle finger's root", "middle finger's third knuckle", "middle finger's second knuckle", "middle finger's first knuckle", "ring finger's root", "ring finger's third knuckle", "ring finger's second knuckle", "ring finger's first knuckle", "pinky finger's root", "pinky finger's third knuckle", "pinky finger's second knuckle", "pinky finger's first knuckle"],"skeleton": []}
6
+
7
+ animal_in_AnimalKindom = {"keypoints":['head mid top', 'eye left', 'eye right', 'mouth front top', 'mouth back left', 'mouth back right', 'mouth front bottom', 'shoulder left', 'shoulder right', 'elbow left', 'elbow right', 'wrist left', 'wrist right', 'torso mid back', 'hip left', 'hip right', 'knee left', 'knee right', 'ankle left ', 'ankle right', 'tail top back', 'tail mid back', 'tail end back'],"skeleton": [[1, 0], [2, 0], [3, 4], [3, 5], [4, 6], [5, 6], [0, 7], [0, 8], [7, 9], [8, 10], [9, 11], [10, 12], [0, 13], [13, 20], [20, 14], [20, 15], [14, 16], [15, 17], [16, 18], [17, 19], [20, 21], [21, 22]]}
8
+
9
+ animal_in_AP10K = {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
10
+
11
+ animal= {"keypoints": ['left eye', 'right eye', 'nose', 'neck', 'root of tail', 'left shoulder', 'left elbow', 'left front paw', 'right shoulder', 'right elbow', 'right front paw', 'left hip', 'left knee', 'left back paw', 'right hip', 'right knee', 'right back paw'], "skeleton": [[1, 2], [1, 3], [2, 3], [3, 4], [4, 5], [4, 6], [6, 7], [7, 8], [4, 9], [9, 10], [10, 11], [5, 12], [12, 13], [13, 14], [5, 15], [15, 16], [16, 17]]}
12
+
13
+ animal_face = {"keypoints": ['right eye right', 'right eye left', 'left eye right', 'left eye left', 'nose tip', 'lip right', 'lip left', 'upper lip', 'lower lip'], "skeleton": []}
14
+
15
+ fly = {"keypoints": ['head', 'eye left', 'eye right', 'neck', 'thorax', 'abdomen', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'wing left', 'wing right'], "skeleton": [[2, 1], [3, 1], [4, 1], [5, 4], [6, 5], [8, 7], [9, 8], [10, 9], [12, 11], [13, 12], [14, 13], [16, 15], [17, 16], [18, 17], [20, 19], [21, 20], [22, 21], [24, 23], [25, 24], [26, 25], [28, 27], [29, 28], [30, 29], [31, 4], [32, 4]]}
16
+
17
+ locust = {"keypoints": ['head', 'neck', 'thorax', 'abdomen1', 'abdomen2', 'anttip left', 'antbase left', 'eye left', 'foreleg left base', 'foreleg left first segment', 'foreleg left second segment', 'foreleg left tip', 'midleg left base', 'midleg left first segment', 'midleg left second segment', 'midleg left tip', 'hindleg left base', 'hindleg left first segment', 'hindleg left second segment', 'hindleg left tip', 'anttip right', 'antbase right', 'eye right', 'foreleg right base', 'foreleg right first segment', 'foreleg right second segment', 'foreleg right tip', 'midleg right base', 'midleg right first segment', 'midleg right second segment', 'midleg right tip', 'hindleg right base', 'hindleg right first segment', 'hindleg right second segment', 'hindleg right tip'],"skeleton": [[2, 1], [3, 2], [4, 3], [5, 4], [7, 6], [8, 7], [10, 9], [11, 10], [12, 11], [14, 13], [15, 14],[16, 15], [18, 17], [19, 18], [20, 19], [22, 21], [23, 22], [25, 24], [26, 25], [27, 26],[29, 28], [30, 29], [31, 30], [33, 32], [34, 33], [35, 34]]}
18
+
19
+ car ={"keypoints": ['right front wheel center', 'left front wheel center', 'right rear wheel center', 'left rear wheel center', 'front right', 'front left', 'back right', 'back left', 'none', 'roof front right', 'roof front left', 'roof back right', 'roof back left', 'none'],"skeleton": [[0, 2], [1, 3], [0, 1], [2, 3], [9, 11], [10, 12], [9, 10], [11, 12], [4, 0], [4, 9], [4, 5], [5, 1], [5, 10], [6, 2], [6, 11], [7, 3], [7, 12], [6, 7]]}
20
+
21
+ short_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
22
+
23
+ long_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'right sleeve inside 3', 'right sleeve inside 4', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 4', 'left sleeve inside 3', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
24
+
25
+ short_sleeved_outwear={'keypoints': ['upper center neckline', 'lower right center neckline', 'lower right neckline', 'upper right neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 2', 'right sleeve inside 1', 'right side outside 1', 'right side outside 2', 'right side outside 3', 'right side inside 3', 'left side outside 3', 'left side outside 2', 'left side outside 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1', 'lower left center neckline', 'left side inside 1', 'left side inside 2', 'left side inside 3', 'right side inside 1', 'right side inside 2'], 'skeleton': []}
26
+
27
+ sling={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
28
+
29
+ vest = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve'], 'skeleton': []}
30
+
31
+ long_sleeved_dress={'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'center hem', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
32
+
33
+ long_sleeved_shirt = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right sleeve outside 3', 'right sleeve outside 4', 'right cuff outside', 'right cuff inside', 'right sleeve inside 4', 'right sleeve inside 3', 'right sleeve inside 2', 'right sleeve inside 1', 'right side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2', 'left side 1', 'left sleeve inside 1', 'left sleeve inside 2', 'left sleeve inside 3', 'left sleeve inside 4', 'left cuff inside', 'left cuff outside', 'left sleeve outside 4', 'left sleeve outside 3', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
34
+
35
+ trousers = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right side outside 3', 'right cuff outside', 'right cuff inside', 'right side inside 1', 'crotch', 'left side inside 1', 'left cuff inside', 'left cuff outside', 'left side outside 3', 'left side outside 2'], 'skeleton': []}
36
+
37
+ sling_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
38
+
39
+ vest_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right side 1', 'right side 2', 'right side 3', 'right side 4', 'right side 5', 'right side 6', 'center hem', 'left side 6', 'left side 5', 'left side 4', 'left side 3', 'left side 2', 'left side 1'], 'skeleton': []}
40
+
41
+ skirt = {'keypoints': ['right side 1', 'upper center', 'left side 1', 'right side 2', 'right side 3', 'center hem', 'left side 3', 'left side 2'], 'skeleton': []}
42
+
43
+ short_sleeved_dress = {'keypoints': ['upper center neckline', 'upper right neckline', 'lower right neckline', 'lower center neckline', 'lower left neckline', 'upper left neckline', 'right sleeve outside 1', 'right sleeve outside 2', 'right cuff outside', 'right cuff inside', 'right sleeve inside 1', 'right sleeve inside 2', 'left side 1', 'left side 2', 'left side 3', 'left side 4', 'left side 5', 'center hem', 'right side 5', 'right side 4', 'right side 3', 'right side 2', 'right side 1', 'left sleeve inside 2', 'left sleeve inside 1', 'left cuff inside', 'left cuff outside', 'left sleeve outside 2', 'left sleeve outside 1'], 'skeleton': []}
44
+
45
+ shorts = {'keypoints': ['right side outside 1', 'upper center', 'left side outside 1', 'right side outside 2', 'right cuff outside', 'right cuff inside', 'crotch', 'left cuff inside', 'left cuff outside', 'left side outside 2'], 'skeleton': []}
46
+
47
+ table = {'keypoints': ['desktop corner 1', 'desktop corner 2', 'desktop corner 3', 'desktop corner 4', 'table leg 1', 'table leg 2', 'table leg 3', 'table leg 4'], 'skeleton': []}
48
+
49
+ chair = {'keypoints': ['legs righttopcorner', 'legs lefttopcorner', 'legs leftbottomcorner', 'legs rightbottomcorner', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'headboard righttop', 'headboard lefttop'], 'skeleton': []}
50
+
51
+ bed = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'backrest lefttop'], 'skeleton': []}
52
+
53
+ sofa = {'keypoints': ['legs rightbottomcorner', 'legs righttopcorner', 'base rightbottom', 'base righttop', 'armrests rightbottomcorner', 'armrests righttopcorner', 'backrest righttop', 'legs leftbottomcorner', 'legs lefttopcorner', 'base leftbottom', 'base lefttop', 'armrests leftbottomcorner', 'armrests lefttopcorner', 'backrest lefttop'], 'skeleton': []}
54
+
55
+ swivelchair = {'keypoints': ['rotatingbase 1', 'rotatingbase 2', 'rotatingbase 3', 'rotatingbase 4', 'rotatingbase 5', 'rotatingbase center', 'base center', 'base righttop', 'base lefttop', 'base leftbottom', 'base rightbottom', 'backrest righttop', 'backrest lefttop'], 'skeleton': []}
56
+
python/utils/dependencies/XPose/transforms.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Transforms and data augmentation for both image + bbox.
4
+ """
5
+ import os
6
+ import sys
7
+ import random
8
+
9
+ import PIL
10
+ import torch
11
+ import torchvision.transforms as T
12
+ import torchvision.transforms.functional as F
13
+
14
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
15
+ from util.box_ops import box_xyxy_to_cxcywh
16
+ from util.misc import interpolate
17
+
18
+
19
def crop(image, target, region):
    """Crop ``image`` to ``region`` and update the annotations in ``target``.

    Args:
        image: Image handed to ``F.crop``.
        target: Annotation dict or ``None``. When given it must contain the
            keys ``"id2catname"`` and ``"caption_list"``; the keys
            ``"boxes"``, ``"masks"``, ``"keypoints"`` and the per-instance
            fields listed below are adjusted when present.
        region: Four values unpacked as ``(i, j, h, w)`` and forwarded to
            ``F.crop`` in that order.

    Returns:
        ``(cropped_image, target)`` where ``target`` is a shallow copy of the
        input dict with updated entries (or ``None`` if none was given).
    """
    cropped_image = F.crop(image, *region)
    if target is None:
        return cropped_image, target

    target = target.copy()
    top, left, height, width = region
    saved_id2catname = target["id2catname"]
    saved_caption_list = target["caption_list"]
    target["size"] = torch.tensor([height, width])

    # Entries that are indexed by the `keep` mask computed below.
    per_instance_fields = ["labels", "area", "iscrowd", "positive_map", "keypoints"]

    if "boxes" in target:
        upper_bound = torch.as_tensor([width, height], dtype=torch.float32)
        shifted = target["boxes"] - torch.as_tensor([left, top, left, top])
        corners = torch.min(shifted.reshape(-1, 2, 2), upper_bound).clamp(min=0)
        target["area"] = (corners[:, 1, :] - corners[:, 0, :]).prod(dim=1)
        target["boxes"] = corners.reshape(-1, 4)
        per_instance_fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, top:top + height, left:left + width]
        per_instance_fields.append("masks")

    # Drop instances whose cropped box (or, without boxes, mask) is empty.
    has_boxes = "boxes" in target
    if has_boxes or "masks" in target:
        if has_boxes:
            # Boxes take priority over masks when deciding what to keep.
            corners = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(corners[:, 1, :] > corners[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for name in per_instance_fields:
            if name in target:
                target[name] = target[name][keep]

        # Debug/visualisation aid only, enabled through the environment.
        debug_on = os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO'
        if debug_on and 'strings_positive' in target:
            target['strings_positive'] = [
                text for text, kept in zip(target['strings_positive'], keep) if kept
            ]

    if "keypoints" in target:
        upper_bound = torch.as_tensor([width, height], dtype=torch.float32)
        original = target["keypoints"]
        rows = original.view(-1, 3)
        # Shift the first two columns into crop coordinates and clip them to
        # [0, (w, h)]; the third column is carried over unchanged.
        xy = rows[:, :2] - torch.as_tensor([left, top])
        xy = torch.min(xy, upper_bound).clamp(min=0)
        merged = torch.cat([xy, rows[:, 2].unsqueeze(1)], dim=1)
        target["keypoints"] = merged.view(original.shape[0], original.shape[1], 3)

    target["id2catname"] = saved_id2catname
    target["caption_list"] = saved_caption_list

    return cropped_image, target
81
+
82
+
83
# Keypoint index pairs that trade places under a horizontal flip, keyed by
# the value of ``target["dataset_name"]``.
_HFLIP_KEYPOINT_PAIRS = {
    "coco_person": [[1, 2], [3, 4], [5, 6], [7, 8],
                    [9, 10], [11, 12], [13, 14], [15, 16]],
    "macaque": [[1, 2], [3, 4], [5, 6], [7, 8],
                [9, 10], [11, 12], [13, 14], [15, 16]],
    "animalkindom_ak_P1_animal": [[1, 2], [4, 5], [7, 8], [9, 10],
                                  [11, 12], [14, 15], [16, 17], [18, 19]],
    "animalweb_animal": [[0, 3], [1, 2], [5, 6]],
    "face": [
        [0, 16], [1, 15], [2, 14], [3, 13], [4, 12], [5, 11], [6, 10], [7, 9],
        [17, 26], [18, 25], [19, 24], [20, 23], [21, 22],
        [31, 35], [32, 34],
        [36, 45], [37, 44], [38, 43], [39, 42], [40, 47], [41, 46],
        [48, 54], [49, 53], [50, 52],
        [55, 59], [56, 58],
        [60, 64], [61, 63],
        [65, 67],
    ],
    "hand": [],
    "foot": [],
    "locust": [[5, 20], [6, 21], [7, 22], [8, 23], [9, 24], [10, 25],
               [11, 26], [12, 27], [13, 28], [14, 29], [15, 30], [16, 31],
               [17, 32], [18, 33], [19, 34]],
    "fly": [[1, 2], [6, 18], [7, 19], [8, 20], [9, 21], [10, 22], [11, 23],
            [12, 24], [13, 25], [14, 26], [15, 27], [16, 28], [17, 29],
            [30, 31]],
    "ap_36k_animal": [[0, 1], [5, 8], [6, 9], [7, 10],
                      [11, 14], [12, 15], [13, 16]],
    "ap_10k_animal": [[0, 1], [5, 8], [6, 9], [7, 10],
                      [11, 14], [12, 15], [13, 16]],
}


def hflip(image, target):
    """Flip ``image`` horizontally and mirror the annotations in ``target``.

    Args:
        image: Image handed to ``F.hflip``; its ``size`` attribute is read
            as ``(width, height)``.
        target: Annotation dict or ``None``. ``"boxes"``, ``"masks"`` and
            ``"keypoints"`` are mirrored when present. When ``"keypoints"``
            is present, ``"dataset_name"`` must also be present and name a
            known dataset.

    Returns:
        ``(flipped_image, target)`` where ``target`` is a shallow copy of the
        input dict with mirrored entries (or ``None`` if none was given).

    Raises:
        ValueError: if keypoints are present but ``dataset_name`` has no
            flip-pair table. Previously this surfaced as an
            ``UnboundLocalError`` on ``flip_pairs``.
    """
    flipped_image = F.hflip(image)

    w, _ = image.size

    if target is None:
        return flipped_image, target

    target = target.copy()

    if "boxes" in target:
        boxes = target["boxes"]
        # Swap columns 0 and 2, then mirror them around the image width.
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    if "keypoints" in target:
        dataset_name = target["dataset_name"]
        try:
            flip_pairs = _HFLIP_KEYPOINT_PAIRS[dataset_name]
        except KeyError:
            raise ValueError(
                "hflip: no keypoint flip pairs defined for dataset_name "
                "{0!r}".format(dataset_name)
            ) from None

        # Work on a private copy: ``target.copy()`` is shallow, so in-place
        # writes would otherwise alter the caller's keypoint tensor.
        keypoints = target["keypoints"].clone()
        # NOTE(review): keypoints mirror as ``w - x - 1`` while boxes mirror
        # as ``w - x``; kept as in the original - confirm this is intended.
        keypoints[:, :, 0] = w - keypoints[:, :, 0] - 1
        for first, second in flip_pairs:
            # Advanced indexing on the right-hand side yields a copy, so the
            # two rows are exchanged without aliasing.
            keypoints[:, [first, second], :] = keypoints[:, [second, first], :]
        target["keypoints"] = keypoints

    return flipped_image, target
146
+
147
+
148
def resize(image, target, size, max_size=None):
    """Resize ``image`` and rescale the annotations in ``target`` to match.

    Args:
        image: Image handed to ``F.resize``; its ``size`` attribute is read
            as ``(width, height)``.
        target: Annotation dict or ``None``.
        size: Either a scalar (treated as the requested length of the
            shorter side, aspect ratio preserved) or a list/tuple, which is
            reversed before being handed to ``F.resize``.
        max_size: Optional cap on the longer side; only consulted when
            ``size`` is a scalar.

    Returns:
        ``(rescaled_image, target)``. ``target`` is ``None`` when none was
        given, otherwise a shallow copy with ``"boxes"``, ``"area"``,
        ``"keypoints"``, ``"size"`` and ``"masks"`` updated where present.
    """

    def _aspect_preserving_size(image_size, size, max_size=None):
        # Returns (height, width) with the shorter side equal to `size`,
        # shrinking `size` first if the longer side would exceed `max_size`.
        w, h = image_size
        if max_size is not None:
            short_side = float(min((w, h)))
            long_side = float(max((w, h)))
            if long_side / short_side * size > max_size:
                size = int(round(max_size * short_side / long_side))

        # Already at the requested size: keep the dimensions untouched.
        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            return (int(size * h / w), size)
        return (size, int(size * w / h))

    if isinstance(size, (list, tuple)):
        out_size = size[::-1]
    else:
        out_size = _aspect_preserving_size(image.size, size, max_size)

    rescaled_image = F.resize(image, out_size)

    if target is None:
        return rescaled_image, None

    ratio_width, ratio_height = (
        float(new) / float(old)
        for new, old in zip(rescaled_image.size, image.size)
    )

    target = target.copy()

    if "boxes" in target:
        box_scale = torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = target["boxes"] * box_scale

    if "area" in target:
        target["area"] = target["area"] * (ratio_width * ratio_height)

    if "keypoints" in target:
        # The third component of each keypoint is left unscaled.
        kpt_scale = torch.as_tensor([ratio_width, ratio_height, 1])
        target["keypoints"] = target["keypoints"] * kpt_scale

    h, w = out_size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        resized_masks = interpolate(
            target['masks'][:, None].float(), out_size, mode="nearest")
        target['masks'] = resized_masks[:, 0] > 0.5

    return rescaled_image, target
211
+
212
+
213
def pad(image, target, padding):
    """Pad ``image`` and record the new size in ``target``.

    Args:
        image: Image handed to ``F.pad``.
        target: Annotation dict or ``None``.
        padding: Indexable whose first two items are passed as the third and
            fourth entries of the ``F.pad`` padding tuple (the first two
            entries are 0), i.e. padding is assumed to go on the bottom and
            right only.

    Returns:
        ``(padded_image, target)``. ``target`` is ``None`` when none was
        given, otherwise a shallow copy with ``"size"`` (and ``"masks"``
        when present) updated.
    """
    pad_x, pad_y = padding[0], padding[1]
    padded_image = F.pad(image, (0, 0, pad_x, pad_y))
    if target is None:
        return padded_image, None

    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, pad_x, 0, pad_y))
    return padded_image, target
224
+
225
+
226
class ResizeDebug:
    """Transform that always resizes to one fixed ``size`` via ``resize``."""

    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        resized = resize(img, target, self.size)
        return resized
232
+
233
+
234
class RandomCrop:
    """Crop a randomly placed window of the configured ``size``."""

    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        # The crop window is drawn by torchvision's RandomCrop.get_params.
        window = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, window)
241
+
242
+
243
class RandomSizeCrop(object):
    """Crop a randomly sized, randomly placed window from the image.

    The window width and height are drawn independently from
    ``[min_size, min(image side, max_size)]``.

    Args:
        min_size: Smallest allowed window side.
        max_size: Largest allowed window side (further capped by the image).
        respect_boxes: ``True`` to retry (up to 10 attempts) until the crop
            keeps every box; ``False`` to tolerate boxes being filtered out
            by the crop.
    """

    def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):
        self.min_size = min_size
        self.max_size = max_size
        self.respect_boxes = respect_boxes

    # The annotation is a string so the class can be defined even when the
    # ``PIL.Image`` submodule has not been imported yet.
    def __call__(self, img: "PIL.Image.Image", target: dict):
        """Return ``(cropped_img, cropped_target)``.

        Retrying only makes sense when there are boxes to preserve. Without
        a target, or without a ``"boxes"`` entry, the first crop is returned
        directly; previously that case either repeated the crop ten times
        for nothing or raised ``KeyError`` on ``result_target["boxes"]``.
        """
        must_keep_boxes = (
            self.respect_boxes and target is not None and "boxes" in target
        )
        init_boxes = len(target["boxes"]) if must_keep_boxes else 0
        max_patience = 10
        for _ in range(max_patience):
            w = random.randint(self.min_size, min(img.width, self.max_size))
            h = random.randint(self.min_size, min(img.height, self.max_size))
            region = T.RandomCrop.get_params(img, [h, w])
            result_img, result_target = crop(img, target, region)
            if not must_keep_boxes or len(result_target["boxes"]) == init_boxes:
                break
        # Either an acceptable crop, or the last attempt once patience ran out.
        return result_img, result_target
263
+
264
+
265
class CenterCrop:
    """Crop a centred window; ``size`` is unpacked as ``(height, width)``."""

    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        image_width, image_height = img.size
        crop_height, crop_width = self.size
        # Round the half-margins so the window sits as centrally as possible.
        top = int(round((image_height - crop_height) / 2.))
        left = int(round((image_width - crop_width) / 2.))
        region = (top, left, crop_height, crop_width)
        return crop(img, target, region)
275
+
276
+
277
class RandomHorizontalFlip:
    """Apply ``hflip`` with probability ``p``; otherwise pass inputs through."""

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        should_flip = random.random() < self.p
        if should_flip:
            return hflip(img, target)
        return img, target
285
+
286
+
287
class RandomResize:
    """Resize to a size picked uniformly at random from ``sizes``.

    ``sizes`` must be a list or tuple; ``max_size`` is forwarded to
    ``resize`` unchanged.
    """

    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        chosen = random.choice(self.sizes)
        return resize(img, target, chosen, self.max_size)
296
+
297
+
298
class RandomPad:
    """Pad by independent random amounts in ``[0, max_pad]`` along x and y."""

    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        # Draw x first, then y, so the random stream is consumed in a fixed order.
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        amounts = (pad_x, pad_y)
        return pad(img, target, amounts)
306
+
307
+
308
class RandomSelect:
    """
    Apply one of two transforms chosen at random: ``transforms1`` with
    probability ``p``, ``transforms2`` with probability ``1 - p``.
    """

    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        chosen = self.transforms1 if random.random() < self.p else self.transforms2
        return chosen(img, target)
322
+
323
+
324
class ToTensor:
    """Convert the image with ``F.to_tensor``; the target passes through."""

    def __call__(self, img, target):
        tensor_image = F.to_tensor(img)
        return tensor_image, target
327
+
328
+
329
class RandomErasing:
    """Wrap ``T.RandomErasing`` so it fits the ``(image, target)`` protocol.

    All constructor arguments are forwarded to ``T.RandomErasing``; the
    target is returned untouched.
    """

    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        erased = self.eraser(img)
        return erased, target
336
+
337
+
338
class Normalize(object):
    """Normalize the image and convert annotations to relative coordinates.

    Args:
        mean: Per-channel mean forwarded to ``F.normalize``.
        std: Per-channel standard deviation forwarded to ``F.normalize``.
        max_keypoints: Number of keypoint slots each instance is zero-padded
            to. Defaults to 68, the value that used to be hard-coded.
    """

    def __init__(self, mean, std, max_keypoints=68):
        self.mean = mean
        self.std = std
        self.max_keypoints = max_keypoints

    def __call__(self, image, target=None):
        """Return ``(normalized_image, target)``.

        ``target`` is ``None`` when none was given, otherwise a shallow copy
        in which:

        * ``"boxes"`` are converted with ``box_xyxy_to_cxcywh`` and divided
          by ``(w, h, w, h)``;
        * ``"area"`` is divided by ``w * h``;
        * ``"keypoints"`` becomes one row per instance holding the
          normalized coordinates padded to ``2 * max_keypoints`` columns,
          followed by the visibility flags (2 mapped to 1) padded to
          ``max_keypoints`` columns; ``"valid_kpt_num"`` records the
          unpadded keypoint count.

        Raises:
            ValueError: if an instance has more than ``max_keypoints``
                keypoints.
        """
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None

        target = target.copy()
        h, w = image.shape[-2:]

        if "boxes" in target:
            boxes = box_xyxy_to_cxcywh(target["boxes"])
            target["boxes"] = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)

        if "area" in target:
            image_area = torch.tensor(w, dtype=torch.float32) * torch.tensor(h, dtype=torch.float32)
            target["area"] = target["area"] / image_area

        if "keypoints" in target:
            keypoints = target["keypoints"]
            num_kpts = keypoints.shape[1]
            if num_kpts > self.max_keypoints:
                raise ValueError(
                    "Normalize: {0} keypoints per instance exceed "
                    "max_keypoints={1}".format(num_kpts, self.max_keypoints)
                )

            # Clone before editing: the slice is a view, and ``target`` is
            # only a shallow copy, so an in-place write would change the
            # caller's keypoint tensor.
            visibility = keypoints[:, :, 2].clone()
            visibility[visibility == 2] = 1

            coords = keypoints[:, :, :2].contiguous().view(-1, 2 * num_kpts)
            coords = coords / torch.tensor([w, h] * num_kpts, dtype=torch.float32)
            target["valid_kpt_num"] = num_kpts

            coords_pad = torch.zeros(coords.shape[0], self.max_keypoints * 2 - coords.shape[1])
            visibility_pad = torch.zeros(visibility.shape[0], self.max_keypoints - num_kpts)
            visibility = torch.cat([visibility, visibility_pad], dim=1)
            coords = torch.cat([coords, coords_pad], dim=1)
            target["keypoints"] = torch.cat([coords, visibility], dim=1)

        return image, target
377
+
378
+
379
class Compose:
    """Chain ``(image, target)`` transforms, applying them in list order."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for transform in self.transforms:
            image, target = transform(image, target)
        return image, target

    def __repr__(self):
        # One line per transform between "ClassName(" and the closing ")".
        pieces = [self.__class__.__name__ + "("]
        pieces.extend(" {0}".format(transform) for transform in self.transforms)
        return "\n".join(pieces) + "\n)"
python/utils/dependencies/XPose/util/addict.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+
3
+
4
class Dict(dict):
    """``dict`` subclass whose items can be read and written as attributes.

    Nested plain dicts (also inside lists and tuples) are converted to
    ``Dict`` when stored through the constructor. Reading a missing key
    returns a detached empty ``Dict`` that attaches itself to its parent on
    first assignment, which is what makes ``d.a.b.c = 1`` work. A frozen
    instance raises ``KeyError`` for unknown keys instead.
    """

    def __init__(__self, *args, **kwargs):
        # Bookkeeping lives in the instance __dict__, not in the mapping.
        # NOTE(review): the receiver is presumably named `__self` so that
        # "self" stays usable as an ordinary keyword key - confirm.
        object.__setattr__(__self, '__parent', kwargs.pop('__parent', None))
        object.__setattr__(__self, '__key', kwargs.pop('__key', None))
        object.__setattr__(__self, '__frozen', False)
        for source in args:
            if not source:
                continue
            if isinstance(source, dict):
                for key, val in source.items():
                    __self[key] = __self._hook(val)
            elif isinstance(source, tuple) and not isinstance(source[0], tuple):
                # A single (key, value) pair.
                __self[source[0]] = __self._hook(source[1])
            else:
                # Any other iterable of (key, value) pairs.
                for key, val in iter(source):
                    __self[key] = __self._hook(val)

        for key, val in kwargs.items():
            __self[key] = __self._hook(val)

    def __setattr__(self, name, value):
        # Names defined on the class (methods etc.) cannot be shadowed.
        if hasattr(self.__class__, name):
            raise AttributeError("'Dict' object attribute "
                                 "'{0}' is read-only".format(name))
        self[name] = value

    def __setitem__(self, name, value):
        frozen = (hasattr(self, '__frozen') and
                  object.__getattribute__(self, '__frozen'))
        if frozen and name not in super().keys():
            raise KeyError(name)
        super().__setitem__(name, value)
        try:
            parent = object.__getattribute__(self, '__parent')
            parent_key = object.__getattribute__(self, '__key')
        except AttributeError:
            # Already attached (or never had a parent): nothing more to do.
            return
        if parent is not None:
            # First write into a placeholder: hook it into its parent and
            # forget the link so this happens only once.
            parent[parent_key] = self
            object.__delattr__(self, '__parent')
            object.__delattr__(self, '__key')

    def __add__(self, other):
        # An empty Dict acts as a neutral element; anything else is an error.
        if self.keys():
            message = "unsupported operand type(s) for +: '{}' and '{}'"
            raise TypeError(message.format(type(self).__name__,
                                           type(other).__name__))
        return other

    @classmethod
    def _hook(cls, item):
        """Recursively wrap dicts (also inside lists/tuples) in ``cls``."""
        if isinstance(item, dict):
            return cls(item)
        if isinstance(item, (list, tuple)):
            return type(item)(cls._hook(member) for member in item)
        return item

    def __getattr__(self, item):
        return self[item]

    def __missing__(self, name):
        if object.__getattribute__(self, '__frozen'):
            raise KeyError(name)
        # Detached placeholder; see __setitem__ for how it gets attached.
        return self.__class__(__parent=self, __key=name)

    def __delattr__(self, name):
        del self[name]

    def to_dict(self):
        """Return a plain ``dict`` copy, unwrapping nested instances."""
        own_type = type(self)
        plain = {}
        for key, value in self.items():
            if isinstance(value, own_type):
                plain[key] = value.to_dict()
            elif isinstance(value, (list, tuple)):
                plain[key] = type(value)(
                    member.to_dict() if isinstance(member, own_type) else member
                    for member in value)
            else:
                plain[key] = value
        return plain

    def copy(self):
        return copy.copy(self)

    def deepcopy(self):
        return copy.deepcopy(self)

    def __deepcopy__(self, memo):
        duplicate = self.__class__()
        memo[id(self)] = duplicate
        for key, value in self.items():
            duplicate[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo)
        return duplicate

    def update(self, *args, **kwargs):
        """Merge recursively: dict values are merged into existing dicts."""
        incoming = {}
        if args:
            if len(args) > 1:
                raise TypeError()
            incoming.update(args[0])
        incoming.update(kwargs)
        for key, value in incoming.items():
            mergeable = (key in self and
                         isinstance(self[key], dict) and
                         isinstance(value, dict))
            if mergeable:
                self[key].update(value)
            else:
                self[key] = value

    def __getnewargs__(self):
        return tuple(self.items())

    def __getstate__(self):
        return self

    def __setstate__(self, state):
        self.update(state)

    def __or__(self, other):
        if not isinstance(other, (Dict, dict)):
            return NotImplemented
        merged = Dict(self)
        merged.update(other)
        return merged

    def __ror__(self, other):
        if not isinstance(other, (Dict, dict)):
            return NotImplemented
        merged = Dict(other)
        merged.update(self)
        return merged

    def __ior__(self, other):
        self.update(other)
        return self

    def setdefault(self, key, default=None):
        if key not in self:
            self[key] = default
            return default
        return self[key]

    def freeze(self, shouldFreeze=True):
        """Set the frozen flag here and on every directly nested ``Dict``."""
        object.__setattr__(self, '__frozen', shouldFreeze)
        for val in self.values():
            if isinstance(val, Dict):
                val.freeze(shouldFreeze)

    def unfreeze(self):
        self.freeze(False)