liguang0115 commited on
Commit
2df809d
·
1 Parent(s): 3a9751f

Add initial project structure with core files, configurations, and sample images

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +5 -0
  2. .gitignore +4 -0
  3. README.md +76 -13
  4. app.py +933 -0
  5. configs/inference/inference.yaml +69 -0
  6. extern/CUT3R/.gitignore +55 -0
  7. extern/CUT3R/LICENSE +6 -0
  8. extern/CUT3R/README.md +208 -0
  9. extern/CUT3R/add_ckpt_path.py +9 -0
  10. extern/CUT3R/cloud_opt/base_opt.py +301 -0
  11. extern/CUT3R/cloud_opt/commons.py +102 -0
  12. extern/CUT3R/cloud_opt/dust3r_opt/__init__.py +31 -0
  13. extern/CUT3R/cloud_opt/dust3r_opt/base_opt.py +620 -0
  14. extern/CUT3R/cloud_opt/dust3r_opt/commons.py +102 -0
  15. extern/CUT3R/cloud_opt/dust3r_opt/init_im_poses.py +382 -0
  16. extern/CUT3R/cloud_opt/dust3r_opt/optimizer.py +341 -0
  17. extern/CUT3R/cloud_opt/init_all.py +222 -0
  18. extern/CUT3R/cloud_opt/utils.py +443 -0
  19. extern/CUT3R/config/dpt_512_vary_4_64.yaml +103 -0
  20. extern/CUT3R/config/linear_224_fixed_16.yaml +99 -0
  21. extern/CUT3R/config/stage1.yaml +74 -0
  22. extern/CUT3R/config/stage2.yaml +132 -0
  23. extern/CUT3R/config/stage3.yaml +219 -0
  24. extern/CUT3R/config/stage4.yaml +219 -0
  25. extern/CUT3R/datasets_preprocess/custom_convert2TUM.py +262 -0
  26. extern/CUT3R/datasets_preprocess/flow_IO.py +476 -0
  27. extern/CUT3R/datasets_preprocess/generate_set_arkitscenes.py +159 -0
  28. extern/CUT3R/datasets_preprocess/generate_set_scannet.py +132 -0
  29. extern/CUT3R/datasets_preprocess/generate_set_scannetpp.py +169 -0
  30. extern/CUT3R/datasets_preprocess/merge_dl3dv.py +85 -0
  31. extern/CUT3R/datasets_preprocess/path_to_root.py +14 -0
  32. extern/CUT3R/datasets_preprocess/preprocess_3dkb.py +220 -0
  33. extern/CUT3R/datasets_preprocess/preprocess_arkitscenes.py +445 -0
  34. extern/CUT3R/datasets_preprocess/preprocess_arkitscenes_highres.py +409 -0
  35. extern/CUT3R/datasets_preprocess/preprocess_bedlam.py +402 -0
  36. extern/CUT3R/datasets_preprocess/preprocess_blendedmvs.py +168 -0
  37. extern/CUT3R/datasets_preprocess/preprocess_co3d.py +391 -0
  38. extern/CUT3R/datasets_preprocess/preprocess_cop3d.py +322 -0
  39. extern/CUT3R/datasets_preprocess/preprocess_dl3dv.py +188 -0
  40. extern/CUT3R/datasets_preprocess/preprocess_dynamic_replica.py +344 -0
  41. extern/CUT3R/datasets_preprocess/preprocess_eden.py +181 -0
  42. extern/CUT3R/datasets_preprocess/preprocess_hoi4d.py +175 -0
  43. extern/CUT3R/datasets_preprocess/preprocess_hypersim.py +268 -0
  44. extern/CUT3R/datasets_preprocess/preprocess_irs.py +230 -0
  45. extern/CUT3R/datasets_preprocess/preprocess_mapfree.py +76 -0
  46. extern/CUT3R/datasets_preprocess/preprocess_mapfree2.py +123 -0
  47. extern/CUT3R/datasets_preprocess/preprocess_megadepth.py +229 -0
  48. extern/CUT3R/datasets_preprocess/preprocess_mp3d.py +217 -0
  49. extern/CUT3R/datasets_preprocess/preprocess_mvimgnet.py +323 -0
  50. extern/CUT3R/datasets_preprocess/preprocess_mvs_synth.py +173 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ test_samples/open_door.jpg filter=lfs diff=lfs merge=lfs -text
37
+ test_samples/oxford.jpeg filter=lfs diff=lfs merge=lfs -text
38
+ test_samples/changi.jpg filter=lfs diff=lfs merge=lfs -text
39
+ test_samples/friends.jpg filter=lfs diff=lfs merge=lfs -text
40
+ test_samples/jesus.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ assets/*
2
+ pycache/*
3
+ __pycache__/*
4
+ .DS_Store
README.md CHANGED
@@ -1,13 +1,76 @@
1
- ---
2
- title: Vmem
3
- emoji: 👁
4
- colorFrom: yellow
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.33.2
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <h1>VMem: Consistent Video Scene Generation with Surfel-Indexed View Memory</h1>
3
+
4
+ <a href="https://v-mem.github.io/"><img src="https://img.shields.io/badge/%F0%9F%8F%A0%20Project%20Page-gray.svg"></a>
5
+ <a href="http://arxiv.org/abs/2503.14489"><img src="https://img.shields.io/badge/%F0%9F%93%84%20arXiv-2503.14489-B31B1B.svg"></a>
6
+ <a href="https://huggingface.co/liguang0115/vmem"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange"></a>
7
+ <a href="https://huggingface.co/spaces/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%9A%80%20Gradio%20Demo-Huggingface-orange"></a>
8
+
9
+ [Runjia Li](https://runjiali-rl.github.io/), [Philip Torr](https://www.robots.ox.ac.uk/~phst/), [Andrea Vedaldi](https://www.robots.ox.ac.uk/~vedaldi/), [Tomas Jakab](https://www.robots.ox.ac.uk/~tomj/)
10
+ <br>
11
+ <br>
12
+ [University of Oxford](https://www.robots.ox.ac.uk/~vgg/)
13
+ </div>
14
+
15
+ <p align="center">
16
+ <img src="assets/demo_teaser.gif" width="100%" alt="Teaser" style="border-radius:10px;"/>
17
+ </p>
18
+
19
+ <!-- <p align="center" border-radius="10px">
20
+ <img src="assets/benchmark.png" width="100%" alt="teaser_page1"/>
21
+ </p> -->
22
+
23
+ # Overview
24
+
25
+ `VMem` is a plug-and-play memory mechanism for image-set models that enables consistent scene generation.
26
+ Existing methods either rely on inpainting with explicit geometry estimation, which suffers from inaccuracies, or use limited context windows in video-based approaches, leading to poor long-term coherence. To overcome these issues, we introduce Surfel-Indexed View Memory (VMem), which anchors past views to the surface elements (surfels) they observed. This enables conditioning novel view generation on the most relevant past views rather than just the most recent ones, enhancing long-term scene consistency while reducing computational cost.
27
+
28
+
29
+ # :wrench: Installation
30
+
31
+ ```bash
32
+ conda create -n vmem python=3.10
33
+ conda activate vmem
34
+ pip install -r requirements.txt
35
+ ```
36
+
37
+
38
+ # :rocket: Usage
39
+
40
+ You need to properly authenticate with Hugging Face to download our model weights. Once set up, our code will handle it automatically at your first run. You can authenticate by running
41
+
42
+ ```bash
43
+ # This will prompt you to enter your Hugging Face credentials.
44
+ huggingface-cli login
45
+ ```
46
+
47
+ Once authenticated, go to our model card [here](https://huggingface.co/stabilityai/stable-virtual-camera) and enter your information for access.
48
+
49
+ We provide a demo for you to interact with `VMem`. Simply run
50
+
51
+ ```bash
52
+ python app.py
53
+ ```
54
+
55
+
56
+ ## :heart: Acknowledgement
57
+ This work is built on top of [CUT3R](https://github.com/CUT3R/CUT3R), [DUSt3R](https://github.com/naver/dust3r) and [Stable Virtual Camera](https://github.com/stability-ai/stable-virtual-camera). We thank the authors for their great work.
58
+
59
+
60
+
61
+
62
+
63
+ # :books: Citing
64
+
65
+ If you find this repository useful, please consider giving it a star :star: and citing our work.
66
+
67
+ ```
68
+ @article{zhou2025stable,
69
+ title={Stable Virtual Camera: Generative View Synthesis with Diffusion Models},
70
+ author={Jensen (Jinghao) Zhou and Hang Gao and Vikram Voleti and Aaryaman Vasishta and Chun-Han Yao and Mark Boss and
71
+ Philip Torr and Christian Rupprecht and Varun Jampani
72
+ },
73
+ journal={arXiv preprint arXiv:2503.14489},
74
+ year={2025}
75
+ }
76
+ ```
app.py ADDED
@@ -0,0 +1,933 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Literal
2
+ from pathlib import Path
3
+ from functools import partial
4
+ import spaces
5
+ import gradio as gr
6
+ import numpy as np
7
+ import torch
8
+ from torchvision.datasets.utils import download_and_extract_archive
9
+ from einops import repeat
10
+ from omegaconf import OmegaConf
11
+ from modeling.pipeline import VMemPipeline
12
+ from diffusers.utils import export_to_video, export_to_gif
13
+ from scipy.spatial.transform import Rotation, Slerp
14
+ from navigation import Navigator
15
+ from PIL import Image
16
+ from utils import tensor_to_pil, encode_vae_image, encode_image, get_default_intrinsics, load_img_and_K, transform_img_and_K
17
+ import os
18
+ import glob
19
+
20
+
21
+ CONFIG_PATH = "configs/inference/inference.yaml"
22
+ CONFIG = OmegaConf.load(CONFIG_PATH)
23
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
+ MODEL = VMemPipeline(CONFIG, DEVICE)
25
+ NAVIGATORS = []
26
+
27
+
28
+ NAVIGATION_FPS = 3
29
+ WIDTH = 576
30
+ HEIGHT = 576
31
+
32
+
33
+ IMAGE_PATHS = ['test_samples/changi.jpg', 'test_samples/oxford.jpeg', 'test_samples/open_door.jpg', 'test_samples/jesus.jpg', 'test_samples/friends.jpg']
34
+
35
+ # for asset_dir in ASSET_DIRS:
36
+ # if os.path.exists(asset_dir):
37
+ # for ext in ["*.jpg", "*.jpeg", "*.png"]:
38
+ # IMAGE_PATHS.extend(glob.glob(os.path.join(asset_dir, ext)))
39
+
40
+ # If no images found, create placeholders
41
+ if not IMAGE_PATHS:
42
def create_placeholder_images(num_samples=5, height=HEIGHT, width=WIDTH):
    """Create gradient placeholder images for the demo.

    Args:
        num_samples: Number of images to generate.
        height: Image height in pixels.
        width: Image width in pixels.

    Returns:
        A list of ``num_samples`` uint8 arrays of shape ``(height, width, 3)``.
        The red channel ramps from top to bottom, the green channel ramps
        from left to right, and the blue channel is a constant that grows
        with the image index.
    """
    # Integer floor division gives the same values as int(255 * k / size)
    # for non-negative k, but is computed once per row/column instead of
    # once per pixel in a Python loop.
    red_ramp = (255 * np.arange(height) // height).astype(np.uint8)[:, None]
    green_ramp = (255 * np.arange(width) // width).astype(np.uint8)[None, :]

    images = []
    for i in range(num_samples):
        img = np.empty((height, width, 3), dtype=np.uint8)
        img[..., 0] = red_ramp  # Red gradient (broadcast across columns)
        img[..., 1] = green_ramp  # Green gradient (broadcast across rows)
        img[..., 2] = int(255 * (i + 1) / num_samples)  # Blue varies by image
        images.append(img)
    return images
55
+
56
+ # Create placeholder video frames and poses
57
def create_placeholder_video_and_poses(num_samples=5, num_frames=1, height=HEIGHT, width=WIDTH):
    """Create placeholder videos and camera poses for the demo.

    Args:
        num_samples: Number of placeholder videos to generate.
        num_frames: Number of (identical) frames per video.
        height: Frame height in pixels.
        width: Frame width in pixels.

    Returns:
        A ``(videos, poses)`` pair of lists with ``num_samples`` entries each.
        Every video is a float tensor of shape
        ``(num_frames, 3, height, width)`` with values in ``[0, 1]``; every
        pose entry is a ``(num_frames, 4, 4)`` stack of identity matrices.

    Raises:
        RuntimeError: If ``num_frames`` is 0, because ``torch.stack`` rejects
            an empty list (same as the original loop-based implementation).
    """
    # The red/green ramps do not depend on the sample, so build them once.
    red_ramp = (255 * np.arange(height) // height).astype(np.uint8)[:, None]
    green_ramp = (255 * np.arange(width) // width).astype(np.uint8)[None, :]

    videos = []
    poses = []
    for i in range(num_samples):
        img = np.empty((height, width, 3), dtype=np.uint8)
        img[..., 0] = red_ramp  # Red gradient
        img[..., 1] = green_ramp  # Green gradient
        img[..., 2] = int(255 * (i + 1) / num_samples)  # Blue varies by video

        # Convert to a torch tensor [C, H, W] with normalized values.
        frame = torch.from_numpy(img.transpose(2, 0, 1)).float() / 255.0

        # All frames of a placeholder video are identical; torch.stack copies
        # the data, so repeating the same tensor object is safe.
        videos.append(torch.stack([frame] * num_frames))

        # One 4x4 identity pose per frame.
        poses.append(torch.eye(4).unsqueeze(0).repeat(num_frames, 1, 1))

    return videos, poses
87
+
88
+ first_frame_list = create_placeholder_images(num_samples=5)
89
+ video_list, poses_list = create_placeholder_video_and_poses(num_samples=5)
90
+
91
+ # Function to load image from path
92
def load_image_for_navigation(image_path):
    """Load an image from disk and prepare the initial navigation state.

    The image is loaded with ``load_img_and_K`` and then cropped to the
    model's configured ``(height, width)`` with ``transform_img_and_K``.
    The module-level ``CONFIG`` is reused instead of re-parsing the YAML file
    on every call, so the target size always matches the configuration the
    pipeline was constructed with.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A dict with three entries:
            "image": the transformed image converted with ``tensor_to_pil``.
            "video": the transformed image tensor, used as the initial video
                (presumably with a leading frame dimension, since callers
                index ``video[0]`` -- confirm against ``load_img_and_K``).
            "pose": a ``[1, 4, 4]`` identity pose tensor.
    """
    # Load the image; intrinsics are not needed here, so K is discarded.
    image, _ = load_img_and_K(image_path, None, K=None, device=DEVICE)

    # Crop/resize to the resolution the model expects.
    target_size = (CONFIG.model.height, CONFIG.model.width)
    image, _ = transform_img_and_K(image, target_size, mode="crop", K=None)

    return {
        "image": tensor_to_pil(image),
        "video": image,
        "pose": torch.eye(4).unsqueeze(0),  # [1, 4, 4]
    }
110
+
111
+
112
class CustomProgressBar:
    """Proxy around a progress-bar object that turns ``set_postfix`` into a no-op.

    Every attribute other than ``pbar`` and ``set_postfix`` is looked up on
    the wrapped object.
    """

    def __init__(self, pbar):
        # The wrapped bar; attribute lookups that miss on the proxy go here.
        self.pbar = pbar

    def set_postfix(self, **_ignored):
        """Accept arbitrary keyword arguments and discard them."""
        return None

    def __getattr__(self, name):
        """Delegate attributes not found on the proxy to the wrapped bar."""
        wrapped = self.pbar
        return getattr(wrapped, name)
121
+
122
def get_duration_navigate_video(video: torch.Tensor,
                                poses: torch.Tensor,
                                x_angle: float,
                                y_angle: float,
                                distance: float
                                ):
    """Estimate the GPU time (in seconds) to request for one navigation step.

    Passed as the ``duration`` callable of ``spaces.GPU`` for
    ``navigate_video``, so it takes the same arguments.

    Args:
        video: Frames generated so far; only its length is used.
        poses: Unused; present to mirror the ``navigate_video`` signature.
        x_angle: Vertical angle of the requested move, in degrees.
        y_angle: Horizontal angle of the requested move, in degrees.
        distance: Requested travel distance; negative means backward.

    Returns:
        The estimated duration in seconds (an int between 15 and 45).
    """
    duration = 15  # Base duration in seconds

    # Sharp turns need more time.
    if abs(x_angle) > 20 or abs(y_angle) > 30:
        duration += 10

    # Long moves need more time. Backward moves use a negative distance and
    # cost just as much as forward ones, so compare the magnitude.
    if abs(distance) > 100:
        duration += 10

    # Longer existing videos mean more processing, capped at 10 extra seconds.
    duration += min(10, len(video))

    return duration
142
+
143
@spaces.GPU(duration=get_duration_navigate_video)
@torch.autocast("cuda")
@torch.no_grad()
def navigate_video(
    video: torch.Tensor,
    poses: torch.Tensor,
    x_angle: float,
    y_angle: float,
    distance: float,
):
    """
    Generate new video frames by navigating in the 3D scene.

    A single Navigator (from navigation.py) is kept in the module-level
    NAVIGATORS list and is created and initialized from the first frame/pose
    on the first call. Exactly one kind of move is performed per call:
    - a non-zero y_angle turns left (positive) or right (negative) by half
      the angle magnitude; distance is ignored in that case
    - otherwise a positive distance moves forward and a negative distance
      moves backward, using max(1, int(|distance| / 10)) steps
    - x_angle (vertical angle) is accepted but currently has no effect

    Args:
        video: Frames generated so far, indexed along the first dimension.
        poses: Camera poses matching ``video``; ``poses[0]`` must be
            reshapeable to 4x4.
        x_angle: Vertical angle in degrees (currently unused).
        y_angle: Horizontal turn angle in degrees.
        distance: Travel distance; negative means backward.

    Returns:
        A 5-tuple ``(video, poses, current_view, video_file, gallery)`` where
        ``video``/``poses`` include the newly generated frames,
        ``current_view`` is the last frame as a PIL image, ``video_file`` is
        the result of ``export_to_video`` and ``gallery`` is a list of
        ``(PIL image, "t=<index>")`` pairs. If no frames were generated, or
        an exception occurred (reported via ``gr.Warning``), the inputs are
        returned unchanged together with their rendered views.
    """

    def _ui_outputs(frames, frame_poses):
        """Render ``frames`` into the five outputs expected by the UI."""
        pil_frames = [tensor_to_pil(frame) for frame in frames]
        gallery = [(image, f"t={i}") for i, image in enumerate(pil_frames)]
        current_view = pil_frames[-1] if pil_frames else None
        video_file = export_to_video(pil_frames, fps=NAVIGATION_FPS) if pil_frames else None
        return frames, frame_poses, current_view, video_file, gallery

    try:
        # Create and initialize the navigator for this session if needed.
        if not NAVIGATORS:
            navigator = Navigator(MODEL, step_size=0.1, num_interpolation_frames=4)

            # The navigator is seeded with the first frame as a PIL image,
            # its pose as a 4x4 numpy matrix and default camera intrinsics.
            initial_frame = tensor_to_pil(video[0])
            initial_pose = poses[0].cpu().numpy().reshape(4, 4)
            initial_K = np.array(get_default_intrinsics()[0])
            navigator.initialize(initial_frame, initial_pose, initial_K)

            # Register only after a successful initialize(); otherwise a
            # failed initialization would leave a half-built navigator that
            # every later call would silently reuse.
            NAVIGATORS.append(navigator)

        navigator = NAVIGATORS[0]

        # NOTE: x_angle (vertical angle) is not supported by Navigator and is
        # intentionally ignored here.

        new_frames = []
        if abs(y_angle) > 0:
            # Turning takes precedence over moving. Halve the magnitude
            # (not the signed value) so that left and right turns of the
            # same size produce the same amount of rotation.
            turn = navigator.turn_left if y_angle > 0 else navigator.turn_right
            new_frames = turn(abs(y_angle) // 2)
        elif distance > 0:
            steps = max(1, int(distance / 10))
            new_frames = navigator.move_forward(steps)
        elif distance < 0:
            steps = max(1, int(abs(distance) / 10))
            new_frames = navigator.move_backward(steps)

        if not new_frames:
            # Nothing was generated: return the current state unchanged.
            return _ui_outputs(video, poses)

        # Convert PIL images to [C, H, W] tensors in the [-1, 1] range.
        new_frames_tensor = torch.stack([
            torch.from_numpy((np.array(frame) / 255.0).transpose(2, 0, 1)).float() * 2.0 - 1.0
            for frame in new_frames
        ])

        # Every new frame is tagged with the navigator's pose after the move.
        # NOTE(review): intermediate frames therefore share the final pose;
        # confirm whether Navigator exposes per-frame poses.
        current_pose = navigator.current_pose
        new_poses = torch.from_numpy(current_pose).float().unsqueeze(0).repeat(len(new_frames), 1, 1)
        new_poses = new_poses.view(len(new_frames), 4, 4)

        # Append the new frames and poses to the existing ones.
        updated_video = torch.cat([video.cpu(), new_frames_tensor], dim=0)
        updated_poses = torch.cat([poses.cpu(), new_poses], dim=0)

        return _ui_outputs(updated_video, updated_poses)
    except Exception as e:
        print(f"Error in navigate_video: {e}")
        gr.Warning(f"Navigation error: {e}")
        # Return the original inputs so the UI keeps its current state.
        return _ui_outputs(video, poses)
255
+
256
+
257
def undo_navigation(
    video: torch.Tensor,
    poses: torch.Tensor,
):
    """
    Undo the last navigation step by dropping the most recent frames.

    Calls ``undo()`` on the navigator stored in NAVIGATORS. When that
    succeeds, ``video`` and ``poses`` are truncated to the number of frames
    the navigator still holds. When there is no navigator, or nothing is
    left to undo, a Gradio warning is raised and the inputs are returned
    unchanged.

    Returns:
        A 5-tuple ``(video, poses, current_view, video_file, gallery)``
        describing the resulting state.
    """
    navigator = NAVIGATORS[0] if len(NAVIGATORS) > 0 else None

    if navigator is None:
        gr.Warning("No navigation session available!")
    elif navigator.undo():
        # The navigator already discarded its frames; mirror that here.
        kept = len(navigator.frames)
        video = video[:kept]
        poses = poses[:kept]
    else:
        gr.Warning("You have no moves left to undo!")

    # Render the (possibly shortened) video for the UI.
    frame_indices = range(len(video))
    gallery = [(tensor_to_pil(video[i]), f"t={i}") for i in frame_indices]
    current_view = tensor_to_pil(video[-1])
    video_file = export_to_video([tensor_to_pil(video[i]) for i in frame_indices], fps=NAVIGATION_FPS)

    return video, poses, current_view, video_file, gallery
303
+
304
+
305
+
306
+
307
+
308
+ def render_demo3(
309
+ s: Literal["Selection", "Generation"],
310
+ idx: int,
311
+ demo3_stage: gr.State,
312
+ demo3_selected_index: gr.State,
313
+ demo3_current_video: gr.State,
314
+ demo3_current_poses: gr.State
315
+ ):
316
+ gr.Markdown(
317
+ """
318
+ ## Single Image → Consistent Scene Navigation
319
+ > #### _Select an image and navigate through the scene by controlling camera movements._
320
+ """,
321
+ elem_classes=["task-title"]
322
+ )
323
+ match s:
324
+ case "Selection":
325
+ with gr.Group():
326
+ # Add upload functionality
327
+ with gr.Group(elem_classes=["gradio-box"]):
328
+ gr.Markdown("### Upload Your Own Image")
329
+ gr.Markdown("_Upload an image to navigate through its 3D scene_")
330
+ with gr.Row():
331
+ with gr.Column(scale=3):
332
+ upload_image = gr.Image(
333
+ label="Upload an image",
334
+ type="filepath",
335
+ height=300,
336
+ elem_id="upload-image"
337
+ )
338
+ with gr.Column(scale=1):
339
+ gr.Markdown("#### Instructions:")
340
+ gr.Markdown("1. Upload a clear, high-quality image")
341
+ gr.Markdown("2. Images with distinct visual features work best")
342
+ gr.Markdown("3. Landscape or architectural scenes are ideal")
343
+ upload_btn = gr.Button("Start Navigation", variant="primary", size="lg")
344
+
345
def process_uploaded_image(image_path):
    """Start a navigation session from a user-uploaded image.

    Returns a ``(stage, selected_index, video, poses)`` tuple. On success the
    stage is "Generation" and the index is None (uploads have no predefined
    index). If no image was uploaded or loading fails, a Gradio warning is
    raised and ``("Selection", None, None, None)`` is returned.
    """
    global NAVIGATORS

    back_to_selection = ("Selection", None, None, None)

    if image_path is None:
        gr.Warning("Please upload an image first")
        return back_to_selection

    try:
        # Load the image and build the initial video/pose state.
        loaded = load_image_for_navigation(image_path)

        # Drop any navigator left over from a previous session.
        NAVIGATORS = []

        return "Generation", None, loaded["video"], loaded["pose"]
    except Exception as e:
        print(f"Error in process_uploaded_image: {e}")
        gr.Warning(f"Error processing uploaded image: {e}")
        return back_to_selection
367
+
368
+ upload_btn.click(
369
+ fn=process_uploaded_image,
370
+ inputs=[upload_image],
371
+ outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
372
+ )
373
+
374
+ gr.Markdown("### Or Choose From Our Examples")
375
+ # Define image captions
376
+ image_captions = {
377
+ 'test_samples/changi.jpg': 'Changi Airport',
378
+ 'test_samples/oxford.jpeg': 'Oxford University',
379
+ 'test_samples/open_door.jpg': 'Bedroom Interior',
380
+ 'test_samples/jesus.jpg': 'Jesus College',
381
+ 'test_samples/friends.jpg': 'Friends Café'
382
+ }
383
+
384
+ # Load all images for the gallery with captions
385
+ gallery_images = []
386
+ for img_path in IMAGE_PATHS:
387
+ try:
388
+ # Get caption or default to basename
389
+ caption = image_captions.get(img_path, os.path.basename(img_path))
390
+ gallery_images.append((img_path, caption))
391
+ except Exception as e:
392
+ print(f"Error loading image {img_path}: {e}")
393
+
394
+ # Show image gallery for selection
395
+ demo3_image_gallery = gr.Gallery(
396
+ value=gallery_images,
397
+ label="Select an Image to Start Navigation",
398
+ columns=len(gallery_images),
399
+ height=400,
400
+ allow_preview=True,
401
+ preview=False,
402
+ elem_id="navigation-gallery"
403
+ )
404
+
405
+ gr.Markdown("_Click on an image to begin navigation_")
406
+
407
def start_navigation(evt: gr.SelectData):
    """Start a navigation session from the example image clicked in the gallery.

    Returns a ``(stage, selected_index, video, poses)`` tuple. On success the
    stage is "Generation" and the index is ``evt.index``. On any error a
    Gradio warning is raised and ``("Selection", None, None, None)`` is
    returned.
    """
    global NAVIGATORS

    try:
        chosen_index = evt.index

        # Load the chosen example and build the initial video/pose state.
        loaded = load_image_for_navigation(IMAGE_PATHS[chosen_index])

        # Drop any navigator left over from a previous session.
        NAVIGATORS = []

        return "Generation", evt.index, loaded["video"], loaded["pose"]
    except Exception as e:
        print(f"Error in start_navigation: {e}")
        gr.Warning(f"Error starting navigation: {e}")
        return "Selection", None, None, None
429
+
430
+ demo3_image_gallery.select(
431
+ fn=start_navigation,
432
+ inputs=None,
433
+ outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
434
+ )
435
+
436
+ case "Generation":
437
+ with gr.Row():
438
+ with gr.Column(scale=3):
439
+ with gr.Row():
440
+ demo3_current_view = gr.Image(
441
+ label="Current View",
442
+ width=256,
443
+ height=256,
444
+ )
445
+ demo3_video = gr.Video(
446
+ label="Generated Video",
447
+ width=256,
448
+ height=256,
449
+ autoplay=True,
450
+ loop=True,
451
+ show_share_button=True,
452
+ show_download_button=True,
453
+ )
454
+
455
+ demo3_generated_gallery = gr.Gallery(
456
+ value=[],
457
+ label="Generated Frames",
458
+ columns=[6],
459
+ )
460
+
461
+ # Initialize the current view with the selected image if available
462
+ if idx is not None:
463
+ try:
464
+ selected_path = IMAGE_PATHS[idx]
465
+ result = load_image_for_navigation(selected_path)
466
+ demo3_current_view.value = result["image"]
467
+ except Exception as e:
468
+ print(f"Error initializing current view: {e}")
469
+
470
+ with gr.Column():
471
+ gr.Markdown("### Navigation Controls ↓")
472
+ with gr.Accordion("Instructions", open=False):
473
+ gr.Markdown("""
474
+ - **The model will predict the next few frames based on your camera movements. Repeat the process to continue navigating through the scene.**
475
+ - **Use the navigation controls to move forward/backward and turn left/right.**
476
+ - **At the end of your navigation, you can save your camera path for later use.**
477
+
478
+ """)
479
+ # with gr.Tab("Basic", elem_id="basic-controls-tab"):
480
+ with gr.Group():
481
+ gr.Markdown("_**Select a direction to move:**_")
482
+ # First row: Turn left/right
483
+ with gr.Row(elem_id="basic-controls"):
484
+ gr.Button(
485
+ "↰20°\nVeer",
486
+ size="sm",
487
+ min_width=0,
488
+ variant="primary",
489
+ ).click(
490
+ fn=partial(
491
+ navigate_video,
492
+ x_angle=0,
493
+ y_angle=20,
494
+ distance=0,
495
+ ),
496
+ inputs=[
497
+ demo3_current_video,
498
+ demo3_current_poses,
499
+ ],
500
+ outputs=[
501
+ demo3_current_video,
502
+ demo3_current_poses,
503
+ demo3_current_view,
504
+ demo3_video,
505
+ demo3_generated_gallery,
506
+ ],
507
+ )
508
+
509
+ gr.Button(
510
+ "↖10°\nTurn",
511
+ size="sm",
512
+ min_width=0,
513
+ variant="primary",
514
+ ).click(
515
+ fn=partial(
516
+ navigate_video,
517
+ x_angle=0,
518
+ y_angle=10,
519
+ distance=0,
520
+ ),
521
+ inputs=[
522
+ demo3_current_video,
523
+ demo3_current_poses,
524
+ ],
525
+ outputs=[
526
+ demo3_current_video,
527
+ demo3_current_poses,
528
+ demo3_current_view,
529
+ demo3_video,
530
+ demo3_generated_gallery,
531
+ ],
532
+ )
533
+
534
+ # gr.Button(
535
+ # "↑0°\nAhead",
536
+ # size="sm",
537
+ # min_width=0,
538
+ # variant="primary",
539
+ # ).click(
540
+ # fn=partial(
541
+ # navigate_video,
542
+ # x_angle=0,
543
+ # y_angle=0,
544
+ # distance=10,
545
+ # ),
546
+ # inputs=[
547
+ # demo3_current_video,
548
+ # demo3_current_poses,
549
+ # ],
550
+ # outputs=[
551
+ # demo3_current_video,
552
+ # demo3_current_poses,
553
+ # demo3_current_view,
554
+ # demo3_video,
555
+ # demo3_generated_gallery,
556
+ # ],
557
+ # )
558
+ gr.Button(
559
+ "↗10°\nTurn",
560
+ size="sm",
561
+ min_width=0,
562
+ variant="primary",
563
+ ).click(
564
+ fn=partial(
565
+ navigate_video,
566
+ x_angle=0,
567
+ y_angle=-10,
568
+ distance=0,
569
+ ),
570
+ inputs=[
571
+ demo3_current_video,
572
+ demo3_current_poses,
573
+ ],
574
+ outputs=[
575
+ demo3_current_video,
576
+ demo3_current_poses,
577
+ demo3_current_view,
578
+ demo3_video,
579
+ demo3_generated_gallery,
580
+ ],
581
+ )
582
+ gr.Button(
583
+ "↱\n20° Veer",
584
+ size="sm",
585
+ min_width=0,
586
+ variant="primary",
587
+ ).click(
588
+ fn=partial(
589
+ navigate_video,
590
+ x_angle=0,
591
+ y_angle=-20,
592
+ distance=0,
593
+ ),
594
+ inputs=[
595
+ demo3_current_video,
596
+ demo3_current_poses,
597
+ ],
598
+ outputs=[
599
+ demo3_current_video,
600
+ demo3_current_poses,
601
+ demo3_current_view,
602
+ demo3_video,
603
+ demo3_generated_gallery,
604
+ ],
605
+ )
606
+
607
+ # Second row: Forward/Backward movement
608
+ with gr.Row(elem_id="forward-backward-controls"):
609
+ gr.Button(
610
+ "↓\nBackward",
611
+ size="sm",
612
+ min_width=0,
613
+ variant="secondary",
614
+ ).click(
615
+ fn=partial(
616
+ navigate_video,
617
+ x_angle=0,
618
+ y_angle=0,
619
+ distance=-10,
620
+ ),
621
+ inputs=[
622
+ demo3_current_video,
623
+ demo3_current_poses,
624
+ ],
625
+ outputs=[
626
+ demo3_current_video,
627
+ demo3_current_poses,
628
+ demo3_current_view,
629
+ demo3_video,
630
+ demo3_generated_gallery,
631
+ ],
632
+ )
633
+
634
+ gr.Button(
635
+ "↑\nForward",
636
+ size="sm",
637
+ min_width=0,
638
+ variant="secondary",
639
+ ).click(
640
+ fn=partial(
641
+ navigate_video,
642
+ x_angle=0,
643
+ y_angle=0,
644
+ distance=10,
645
+ ),
646
+ inputs=[
647
+ demo3_current_video,
648
+ demo3_current_poses,
649
+ ],
650
+ outputs=[
651
+ demo3_current_video,
652
+ demo3_current_poses,
653
+ demo3_current_view,
654
+ demo3_video,
655
+ demo3_generated_gallery,
656
+ ],
657
+ )
658
+ # with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
659
+ # with gr.Group():
660
+ # gr.Markdown("_**Select angles and distance:**_")
661
+
662
+ # demo3_y_angle = gr.Slider(
663
+ # minimum=-90,
664
+ # maximum=90,
665
+ # value=0,
666
+ # step=10,
667
+ # label="Horizontal Angle",
668
+ # interactive=True,
669
+ # )
670
+ # demo3_x_angle = gr.Slider(
671
+ # minimum=-40,
672
+ # maximum=40,
673
+ # value=0,
674
+ # step=10,
675
+ # label="Vertical Angle",
676
+ # interactive=True,
677
+ # )
678
+ # demo3_distance = gr.Slider(
679
+ # minimum=-200,
680
+ # maximum=200,
681
+ # value=100,
682
+ # step=10,
683
+ # label="Distance (negative = backward)",
684
+ # interactive=True,
685
+ # )
686
+
687
+ # gr.Button(
688
+ # "Generate Next Move", variant="primary"
689
+ # ).click(
690
+ # fn=navigate_video,
691
+ # inputs=[
692
+ # demo3_current_video,
693
+ # demo3_current_poses,
694
+ # demo3_x_angle,
695
+ # demo3_y_angle,
696
+ # demo3_distance,
697
+ # ],
698
+ # outputs=[
699
+ # demo3_current_video,
700
+ # demo3_current_poses,
701
+ # demo3_current_view,
702
+ # demo3_video,
703
+ # demo3_generated_gallery,
704
+ # ],
705
+ # )
706
+ gr.Markdown("---")
707
+ with gr.Group():
708
+ gr.Markdown("_**Navigation controls:**_")
709
+ with gr.Row():
710
+ gr.Button("Undo Last Move", variant="huggingface").click(
711
+ fn=undo_navigation,
712
+ inputs=[demo3_current_video, demo3_current_poses],
713
+ outputs=[
714
+ demo3_current_video,
715
+ demo3_current_poses,
716
+ demo3_current_view,
717
+ demo3_video,
718
+ demo3_generated_gallery,
719
+ ],
720
+ )
721
+
722
+ # Add a function to save camera poses
723
+ def save_camera_poses(video, poses):
724
+ if len(NAVIGATORS) > 0:
725
+ navigator = NAVIGATORS[0]
726
+ # Create a directory for saved poses
727
+ os.makedirs("./visualization", exist_ok=True)
728
+ save_path = f"./visualization/transforms_{len(navigator.frames)}_frames.json"
729
+ navigator.save_camera_poses(save_path)
730
+ return gr.Info(f"Camera poses saved to {save_path}")
731
+ return gr.Warning("No navigation instance found")
732
+
733
+ gr.Button("Save Camera", variant="huggingface").click(
734
+ fn=save_camera_poses,
735
+ inputs=[demo3_current_video, demo3_current_poses],
736
+ outputs=[]
737
+ )
738
+
739
+ # Add a button to return to image selection
740
+ def reset_navigation():
741
+ # Clear current navigator
742
+ global NAVIGATORS
743
+ NAVIGATORS = []
744
+ return "Selection", None, None, None
745
+
746
+ gr.Button("Choose New Image", variant="secondary").click(
747
+ fn=reset_navigation,
748
+ inputs=[],
749
+ outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
750
+ )
751
+
752
+
753
+ # Create the Gradio Blocks
754
+ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
755
+ gr.HTML(
756
+ """
757
+ <style>
758
+ [data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
759
+ font-size: 16px !important;
760
+ font-weight: bold;
761
+ }
762
+ #page-title h1 {
763
+ color: #0D9488 !important;
764
+ }
765
+ .task-title h2 {
766
+ color: #F59E0C !important;
767
+ }
768
+ .header-button-row {
769
+ gap: 4px !important;
770
+ }
771
+ .header-button-row div {
772
+ width: 131.0px !important;
773
+ }
774
+ .header-button-column {
775
+ width: 131.0px !important;
776
+ gap: 5px !important;
777
+ }
778
+ .header-button a {
779
+ border: 1px solid #e4e4e7;
780
+ }
781
+ .header-button .button-icon {
782
+ margin-right: 8px;
783
+ }
784
+ .demo-button-column .gap {
785
+ gap: 5px !important;
786
+ }
787
+ #basic-controls {
788
+ column-gap: 0px;
789
+ }
790
+ #basic-controls-tab {
791
+ padding: 0px;
792
+ }
793
+ #advanced-controls-tab {
794
+ padding: 0px;
795
+ }
796
+ #forward-backward-controls {
797
+ column-gap: 0px;
798
+ justify-content: center;
799
+ margin-top: 8px;
800
+ }
801
+ #selected-demo-button {
802
+ color: #F59E0C;
803
+ text-decoration: underline;
804
+ }
805
+ .demo-button {
806
+ text-align: left !important;
807
+ display: block !important;
808
+ }
809
+ #navigation-gallery {
810
+ margin-bottom: 15px;
811
+ }
812
+ #navigation-gallery .gallery-item {
813
+ cursor: pointer;
814
+ border-radius: 6px;
815
+ transition: transform 0.2s, box-shadow 0.2s;
816
+ }
817
+ #navigation-gallery .gallery-item:hover {
818
+ transform: scale(1.02);
819
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
820
+ }
821
+ #navigation-gallery .gallery-item.selected {
822
+ border: 3px solid #0D9488;
823
+ }
824
+ /* Upload image styling */
825
+ #upload-image {
826
+ border-radius: 8px;
827
+ border: 2px dashed #0D9488;
828
+ padding: 10px;
829
+ transition: all 0.3s ease;
830
+ }
831
+ #upload-image:hover {
832
+ border-color: #F59E0C;
833
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
834
+ }
835
+ /* Box styling */
836
+ .gradio-box {
837
+ border-radius: 10px;
838
+ margin-bottom: 20px;
839
+ padding: 15px;
840
+ background-color: #f8f9fa;
841
+ border: 1px solid #e9ecef;
842
+ }
843
+ </style>
844
+ """
845
+ )
846
+
847
+ demo_idx = gr.State(value=3)
848
+
849
+ with gr.Sidebar():
850
+ gr.Markdown("# VMem: Consistent Scene Generation with Surfel Memory of Views", elem_id="page-title")
851
+ gr.Markdown(
852
+ "### Official Interactive Demo for [_VMem_](https://arxiv.org/abs/2502.06764)"
853
+ )
854
+ gr.Markdown("---")
855
+ gr.Markdown("#### Links ↓")
856
+ with gr.Row(elem_classes=["header-button-row"]):
857
+ with gr.Column(elem_classes=["header-button-column"], min_width=0):
858
+ gr.Button(
859
+ value="Website",
860
+ link="https://v-mem.github.io/",
861
+ icon="https://simpleicons.org/icons/googlechrome.svg",
862
+ elem_classes=["header-button"],
863
+ size="md",
864
+ min_width=0,
865
+ )
866
+ gr.Button(
867
+ value="Paper",
868
+ link="https://arxiv.org/abs/2502.06764",
869
+ icon="https://simpleicons.org/icons/arxiv.svg",
870
+ elem_classes=["header-button"],
871
+ size="md",
872
+ min_width=0,
873
+ )
874
+ with gr.Column(elem_classes=["header-button-column"], min_width=0):
875
+ gr.Button(
876
+ value="Code",
877
+ link="https://github.com/kwsong0113/diffusion-forcing-transformer",
878
+ icon="https://simpleicons.org/icons/github.svg",
879
+ elem_classes=["header-button"],
880
+ size="md",
881
+ min_width=0,
882
+ )
883
+ gr.Button(
884
+ value="Weights",
885
+ link="https://huggingface.co/liguang0115/vmem",
886
+ icon="https://simpleicons.org/icons/huggingface.svg",
887
+ elem_classes=["header-button"],
888
+ size="md",
889
+ min_width=0,
890
+ )
891
+ gr.Markdown("---")
892
+ gr.Markdown("#### Choose a Demo ↓")
893
+ with gr.Column(elem_classes=["demo-button-column"]):
894
+ @gr.render(inputs=[demo_idx])
895
+ def render_demo_tabs(idx):
896
+ demo_tab_button3 = gr.Button(
897
+ "Navigate Image",
898
+ size="md", elem_classes=["demo-button"], **{"elem_id": "selected-demo-button"} if idx == 3 else {}
899
+ ).click(
900
+ fn=lambda: 3,
901
+ outputs=demo_idx
902
+ )
903
+ gr.Markdown("---")
904
+ gr.Markdown("#### Troubleshooting ↓")
905
+ with gr.Group():
906
+ with gr.Accordion("Error or Unexpected Results?", open=False):
907
+ gr.Markdown("Please try again after refreshing the page and ensure you do not click the same button multiple times.")
908
+ with gr.Accordion("Too Slow or No GPU Allocation?", open=False):
909
+ gr.Markdown(
910
+ "Consider running the demo locally (click the dots in the top-right corner). Alternatively, you can subscribe to Hugging Face Pro for an increased GPU quota."
911
+ )
912
+
913
+
914
+ demo3_stage = gr.State(value="Selection")
915
+ demo3_selected_index = gr.State(value=None)
916
+ demo3_current_video = gr.State(value=None)
917
+ demo3_current_poses = gr.State(value=None)
918
+
919
+ @gr.render(inputs=[demo_idx, demo3_stage, demo3_selected_index])
920
+ def render_demo(
921
+ _demo_idx, _demo3_stage, _demo3_selected_index
922
+ ):
923
+ match _demo_idx:
924
+ case 3:
925
+ render_demo3(_demo3_stage, _demo3_selected_index, demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses)
926
+
927
+
928
if __name__ == "__main__":
    # Launch settings for the interactive demo.
    launch_options = dict(
        debug=True,
        share=True,
        max_threads=1,  # Limit concurrent processing
        show_error=True,  # Show detailed error messages
    )
    demo.launch(**launch_options)
configs/inference/inference.yaml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ model:
3
+ height: 576
4
+ width: 576
5
+ original_height: 288
6
+ original_width: 512
7
+ cache_dir: "/homes/55/runjia/storage/svd_weights"
8
+ # pretrained_model_path: "stabilityai/stable-diffusion-2-1"
9
+ # pretrained_video_model_path: "stabilityai/stable-video-diffusion-img2vid"
10
+
11
+ context_num_frames: 4
12
+ target_num_frames: 4
13
+ num_frames: 8
14
+ vae_spatial_scale: 8
15
+ latent_channels: 4
16
+ # num_ray_blocks: 2
17
+ vae_scale_factor: 8
18
+ inference_mode: false
19
+
20
+ temporal_only: false
21
+ use_non_maximum_suppression: true
22
+ translation_distance_weight: 0.1
23
+
24
+ camera_scale: 2.0
25
+ inference_num_steps: 50
26
+ cfg_min: 1.2
27
+ cfg: 3.0
28
+ guider_types: 1
29
+
30
+ samples_dir: "./visualization"
31
+ save_flag: false
32
+ use_wandb: false
33
+
34
+
35
+
36
+ # model_path: "/homes/55/runjia/storage/simview_weights/2025-04-30_12-08-55/checkpoint_230000.pth"
37
+ model_path: "liguang0115/vmem"
38
+
39
+
40
+ surfel:
41
+ use_surfel: true
42
+ shrink_factor: 0.05
43
+ radius_scale: 0.5
44
+ conf_thresh: 1
45
+ merge_position_threshold: 0.2
46
+ merge_normal_threshold: 0.6
47
+ lr: 0.01
48
+ niter: 1000
49
+ model_path: "./extern/CUT3R/src/cut3r_512_dpt_4_64.pth"
50
+ width: 512
51
+ height: 288
52
+
53
+ inference:
54
+ visualize: true
55
+ visualize_pointcloud: false
56
+ visualize_surfel: false
57
+ save_surfels: false
58
+ image_dir: "/homes/55/runjia/storage/realestate10k/video_data/test"
59
+ meta_info_dir: "/homes/55/runjia/storage/realestate10k/RealEstate10K/test"
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+ visualization_dir: "./visualization"
69
+ seed: 42
extern/CUT3R/.gitignore ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+
5
+ # C extensions
6
+ *.so
7
+
8
+ # Distribution / packaging
9
+ bin/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ eggs/
14
+ lib/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Installer logs
24
+ pip-log.txt
25
+ pip-delete-this-directory.txt
26
+
27
+ # Unit test / coverage reports
28
+ .tox/
29
+ .coverage
30
+ .cache
31
+ nosetests.xml
32
+ coverage.xml
33
+
34
+ # Translations
35
+ *.mo
36
+
37
+ # Mr Developer
38
+ .mr.developer.cfg
39
+ .project
40
+ .pydevproject
41
+
42
+ # Rope
43
+ .ropeproject
44
+
45
+ # Django stuff:
46
+ *.log
47
+ *.pot
48
+
49
+ # Sphinx documentation
50
+ docs/_build/
51
+
52
+ # Ignore data and ckpts
53
+ *.pth
54
+ data
55
+ src/checkpoints
extern/CUT3R/LICENSE ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Copyright [2025–present]
2
+
3
+ CUT3R is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 License.
4
+
5
+ To view a copy of the CC BY-NC-SA 4.0, visit:
6
+ https://creativecommons.org/licenses/by-nc-sa/4.0/
extern/CUT3R/README.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Continuous 3D Perception Model with Persistent State
2
+ <div align="center">
3
+ <img src="./assets/factory-ezgif.com-video-speed.gif" alt="CUT3R" />
4
+ </div>
5
+
6
+ <hr>
7
+
8
+ <br>
9
+ Official implementation of <strong>Continuous 3D Perception Model with Persistent State</strong>, CVPR 2025 (Oral)
10
+
11
+ [*Qianqian Wang**](https://qianqianwang68.github.io/),
12
+ [*Yifei Zhang**](https://forrest-110.github.io/),
13
+ [*Aleksander Holynski*](https://holynski.org/),
14
+ [*Alexei A Efros*](https://people.eecs.berkeley.edu/~efros/),
15
+ [*Angjoo Kanazawa*](https://people.eecs.berkeley.edu/~kanazawa/)
16
+
17
+
18
+ (*: equal contribution)
19
+
20
+ <div style="line-height: 1;">
21
+ <a href="https://cut3r.github.io/" target="_blank" style="margin: 2px;">
22
+ <img alt="Website" src="https://img.shields.io/badge/Website-CUT3R-536af5?color=536af5&logoColor=white" style="display: inline-block; vertical-align: middle;"/>
23
+ </a>
24
+ <a href="https://arxiv.org/pdf/2501.12387" target="_blank" style="margin: 2px;">
25
+ <img alt="Arxiv" src="https://img.shields.io/badge/Arxiv-CUT3R-red?logo=%23B31B1B" style="display: inline-block; vertical-align: middle;"/>
26
+ </a>
27
+ </div>
28
+
29
+
30
+ ![Example of capabilities](assets/ezgif.com-video-to-gif-converter.gif)
31
+
32
+ ## Table of Contents
33
+ - [TODO](#todo)
34
+ - [Getting Started](#getting-started)
35
+ - [Installation](#installation)
36
+ - [Checkpoints](#download-checkpoints)
37
+ - [Inference](#inference)
38
+ - [Datasets](#datasets)
39
+ - [Evaluation](#evaluation)
40
+ - [Datasets](#datasets-1)
41
+ - [Evaluation Scripts](#evaluation-scripts)
42
+ - [Training and Fine-tuning](#training-and-fine-tuning)
43
+ - [Acknowledgements](#acknowledgements)
44
+ - [Citation](#citation)
45
+
46
+ ## TODO
47
+ - [x] Release multi-view stereo results of DL3DV dataset.
48
+ - [ ] Online demo integrated with WebCam
49
+
50
+ ## Getting Started
51
+
52
+ ### Installation
53
+
54
+ 1. Clone CUT3R.
55
+ ```bash
56
+ git clone https://github.com/CUT3R/CUT3R.git
57
+ cd CUT3R
58
+ ```
59
+
60
+ 2. Create the environment.
61
+ ```bash
62
+ conda create -n cut3r python=3.11 cmake=3.14.0
63
+ conda activate cut3r
64
+ conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia # use the correct version of cuda for your system
65
+ pip install -r requirements.txt
66
+ # issues with pytorch dataloader, see https://github.com/pytorch/pytorch/issues/99625
67
+ conda install 'llvm-openmp<16'
68
+ # for training logging
69
+ pip install git+https://github.com/nerfstudio-project/gsplat.git
70
+ # for evaluation
71
+ pip install evo
72
+ pip install open3d
73
+ ```
74
+
75
+ 3. Compile the cuda kernels for RoPE (as in CroCo v2).
76
+ ```bash
77
+ cd src/croco/models/curope/
78
+ python setup.py build_ext --inplace
79
+ cd ../../../../
80
+ ```
81
+
82
+ ### Download Checkpoints
83
+
84
+ We currently provide checkpoints on Google Drive:
85
+
86
+ | Modelname | Training resolutions | #Views| Head |
87
+ |-------------|----------------------|-------|------|
88
+ | [`cut3r_224_linear_4.pth`](https://drive.google.com/file/d/11dAgFkWHpaOHsR6iuitlB_v4NFFBrWjy/view?usp=drive_link) | 224x224 | 16 | Linear |
89
+ | [`cut3r_512_dpt_4_64.pth`](https://drive.google.com/file/d/1Asz-ZB3FfpzZYwunhQvNPZEUA8XUNAYD/view?usp=drive_link) | 512x384, 512x336, 512x288, 512x256, 512x160, 384x512, 336x512, 288x512, 256x512, 160x512 | 4-64 | DPT |
90
+
91
+ > `cut3r_224_linear_4.pth` is our intermediate checkpoint and `cut3r_512_dpt_4_64.pth` is our final checkpoint.
92
+
93
+ To download the weights, run the following commands:
94
+ ```bash
95
+ cd src
96
+ # for 224 linear ckpt
97
+ gdown --fuzzy https://drive.google.com/file/d/11dAgFkWHpaOHsR6iuitlB_v4NFFBrWjy/view?usp=drive_link
98
+ # for 512 dpt ckpt
99
+ gdown --fuzzy https://drive.google.com/file/d/1Asz-ZB3FfpzZYwunhQvNPZEUA8XUNAYD/view?usp=drive_link
100
+ cd ..
101
+ ```
102
+
103
+ ### Inference
104
+
105
+ To run the inference code, you can use the following command:
106
+ ```bash
107
+ # the following script will run inference offline and visualize the output with viser on port 8080
108
+ python demo.py --model_path MODEL_PATH --seq_path SEQ_PATH --size SIZE --vis_threshold VIS_THRESHOLD --output_dir OUT_DIR # input can be a folder or a video
109
+ # Example:
110
+ # python demo.py --model_path src/cut3r_512_dpt_4_64.pth --size 512 \
111
+ # --seq_path examples/001 --vis_threshold 1.5 --output_dir tmp
112
+ #
113
+ # python demo.py --model_path src/cut3r_224_linear_4.pth --size 224 \
114
+ # --seq_path examples/001 --vis_threshold 1.5 --output_dir tmp
115
+
116
+ # the following script will run inference with global alignment and visualize the output with viser on port 8080
117
+ python demo_ga.py --model_path MODEL_PATH --seq_path SEQ_PATH --size SIZE --vis_threshold VIS_THRESHOLD --output_dir OUT_DIR
118
+ ```
119
+ Output results will be saved to `output_dir`.
120
+
121
+ > Currently, we accelerate the feedforward process by processing inputs in parallel within the encoder, which results in linear memory consumption as the number of frames increases.
122
+
123
+ ## Datasets
124
+ Our training data includes 32 datasets listed below. We provide processing scripts for all of them. Please download the datasets from their official sources, and refer to [preprocess.md](docs/preprocess.md) for processing scripts and more information about the datasets.
125
+
126
+ - [ARKitScenes](https://github.com/apple/ARKitScenes)
127
+ - [BlendedMVS](https://github.com/YoYo000/BlendedMVS)
128
+ - [CO3Dv2](https://github.com/facebookresearch/co3d)
129
+ - [MegaDepth](https://www.cs.cornell.edu/projects/megadepth/)
130
+ - [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/)
131
+ - [ScanNet](http://www.scan-net.org/ScanNet/)
132
+ - [WayMo Open dataset](https://github.com/waymo-research/waymo-open-dataset)
133
+ - [WildRGB-D](https://github.com/wildrgbd/wildrgbd/)
134
+ - [Map-free](https://research.nianticlabs.com/mapfree-reloc-benchmark/dataset)
135
+ - [TartanAir](https://theairlab.org/tartanair-dataset/)
136
+ - [UnrealStereo4K](https://github.com/fabiotosi92/SMD-Nets)
137
+ - [Virtual KITTI 2](https://europe.naverlabs.com/research/computer-vision/proxy-virtual-worlds-vkitti-2/)
138
+ - [3D Ken Burns](https://github.com/sniklaus/3d-ken-burns.git)
139
+ - [BEDLAM](https://bedlam.is.tue.mpg.de/)
140
+ - [COP3D](https://github.com/facebookresearch/cop3d)
141
+ - [DL3DV](https://github.com/DL3DV-10K/Dataset)
142
+ - [Dynamic Replica](https://github.com/facebookresearch/dynamic_stereo)
143
+ - [EDEN](https://lhoangan.github.io/eden/)
144
+ - [Hypersim](https://github.com/apple/ml-hypersim)
145
+ - [IRS](https://github.com/HKBU-HPML/IRS)
146
+ - [Matterport3D](https://niessner.github.io/Matterport/)
147
+ - [MVImgNet](https://github.com/GAP-LAB-CUHK-SZ/MVImgNet)
148
+ - [MVS-Synth](https://phuang17.github.io/DeepMVS/mvs-synth.html)
149
+ - [OmniObject3D](https://omniobject3d.github.io/)
150
+ - [PointOdyssey](https://pointodyssey.com/)
151
+ - [RealEstate10K](https://google.github.io/realestate10k/)
152
+ - [SmartPortraits](https://mobileroboticsskoltech.github.io/SmartPortraits/)
153
+ - [Spring](https://spring-benchmark.org/)
154
+ - [Synscapes](https://synscapes.on.liu.se/)
155
+ - [UASOL](https://osf.io/64532/)
156
+ - [UrbanSyn](https://www.urbansyn.org/)
157
+ - [HOI4D](https://hoi4d.github.io/)
158
+
159
+
160
+ ## Evaluation
161
+
162
+ ### Datasets
163
+ Please follow [MonST3R](https://github.com/Junyi42/monst3r/blob/main/data/evaluation_script.md) and [Spann3R](https://github.com/HengyiWang/spann3r/blob/main/docs/data_preprocess.md) to prepare **Sintel**, **Bonn**, **KITTI**, **NYU-v2**, **TUM-dynamics**, **ScanNet**, **7scenes** and **Neural-RGBD** datasets.
164
+
165
+ The datasets should be organized as follows:
166
+ ```
167
+ data/
168
+ ├── 7scenes
169
+ ├── bonn
170
+ ├── kitti
171
+ ├── neural_rgbd
172
+ ├── nyu-v2
173
+ ├── scannetv2
174
+ ├── sintel
175
+ └── tum
176
+ ```
177
+
178
+ ### Evaluation Scripts
179
+ Please refer to the [eval.md](docs/eval.md) for more details.
180
+
181
+ ## Training and Fine-tuning
182
+ Please refer to the [train.md](docs/train.md) for more details.
183
+
184
+ ## Acknowledgements
185
+ Our code is based on the following awesome repositories:
186
+
187
+ - [DUSt3R](https://github.com/naver/dust3r)
188
+ - [MonST3R](https://github.com/Junyi42/monst3r.git)
189
+ - [Spann3R](https://github.com/HengyiWang/spann3r.git)
190
+ - [Viser](https://github.com/nerfstudio-project/viser)
191
+
192
+ We thank the authors for releasing their code!
193
+
194
+
195
+
196
+ ## Citation
197
+
198
+ If you find our work useful, please cite:
199
+
200
+ ```bibtex
201
+ @article{wang2025continuous,
202
+ title={Continuous 3D Perception Model with Persistent State},
203
+ author={Wang, Qianqian and Zhang, Yifei and Holynski, Aleksander and Efros, Alexei A and Kanazawa, Angjoo},
204
+ journal={arXiv preprint arXiv:2501.12387},
205
+ year={2025}
206
+ }
207
+ ```
208
+
extern/CUT3R/add_ckpt_path.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import os.path as path
4
+
5
+
6
def add_path_to_dust3r(ckpt):
    """Put the directory containing ``ckpt`` at the front of ``sys.path``.

    Args:
        ckpt: Path to a checkpoint file (relative or absolute).
    """
    # workaround for sibling import
    ckpt_dir, _ = os.path.split(os.path.abspath(ckpt))
    sys.path.insert(0, ckpt_dir)
extern/CUT3R/cloud_opt/base_opt.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ import cv2
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import roma
7
+ from copy import deepcopy
8
+ import tqdm
9
+ import os
10
+ import matplotlib.pyplot as plt
11
+
12
+ from cloud_opt.utils import *
13
+ from cloud_opt.utils import _check_edges, _compute_img_conf
14
+ import cloud_opt.init_all as init_fun
15
+
16
+
17
class BaseOptimizer(nn.Module):
    """Optimize a global scene, given a graph-organized observations.
    Graph node: images
    Graph edges: observations = (pred1, pred2), pred2 is in pred1's coordinate

    This base class relies on attributes and methods that are not defined
    here (e.g. ``device``, ``get_pw_poses``, ``get_adaptors``, ``get_pts3d``,
    ``conf_i``/``conf_j``, ``pred_i``/``pred_j``, ``im_poses``,
    ``get_im_poses``); presumably subclasses provide them.
    """

    def __init__(self, *args, **kwargs):
        # Intentionally empty: nn.Module.__init__ is invoked from
        # _init_from_views instead.
        pass

    def _init_from_views(
        self,
        view1s,
        view2s,
        pred1s,
        pred2s,  # whatever predictions, they should be organized into pairwise for graph optimization
        dist="l1",
        conf="log",
        min_conf_thr=3,
        thr_for_init_conf=False,
        base_scale=0.5,
        allow_pw_adaptors=False,
        pw_break=20,
        rand_pose=torch.randn,
        empty_cache=False,
        verbose=True,
    ):
        """Build the edge graph and the pairwise parameters from paired views.

        Args:
            view1s, view2s: Paired views; each must expose an ``"idx"`` entry.
            pred1s, pred2s: Paired predictions, indexed like the views.
            dist: Key into ``ALL_DISTS`` selecting the distance function.
            conf: Mode passed to ``get_conf_trf``.
            min_conf_thr: Stored as ``self.min_conf_thr``.
            thr_for_init_conf: Stored as ``self.thr_for_init_conf``.
            base_scale: Target scale used by ``get_pw_norm_scale_factor``.
            allow_pw_adaptors: Whether ``pw_adaptors`` receives gradients.
            pw_break: Stored as ``self.pw_break``.
            rand_pose: Callable creating the initial pairwise pose tensor.
            empty_cache: Stored as ``self.empty_cache``; read by
                ``global_alignment_iter``.
            verbose: Stored as ``self.verbose``; read by
                ``global_alignment_loop``.
        """
        super().__init__()
        # The optimisation helpers in this module read these flags from the
        # network, so they must be kept on the instance.
        self.verbose = verbose
        self.empty_cache = empty_cache

        self.edges = [
            (int(view1["idx"]), int(view2["idx"]))
            for view1, view2 in zip(view1s, view2s)
        ]
        self.dist = ALL_DISTS[dist]
        self.n_imgs = _check_edges(self.edges)

        self.edge2pts_i = NoGradParamDict(
            {ij: pred1s[n]["pts3d_is_self_view"] for n, ij in enumerate(self.str_edges)}
        )  # ij: the name of the edge
        self.edge2pts_j = NoGradParamDict(
            {
                ij: pred2s[n]["pts3d_in_other_view"]
                for n, ij in enumerate(self.str_edges)
            }
        )
        self.edge2conf_i = NoGradParamDict(
            {ij: pred1s[n]["conf_self"] for n, ij in enumerate(self.str_edges)}
        )
        self.edge2conf_j = NoGradParamDict(
            {ij: pred2s[n]["conf"] for n, ij in enumerate(self.str_edges)}
        )

        self.imshapes = get_imshapes(self.edges, pred1s, pred2s)
        self.min_conf_thr = min_conf_thr
        self.thr_for_init_conf = thr_for_init_conf
        self.conf_trf = get_conf_trf(conf)

        # NOTE(review): self.device is not defined in this class; presumably
        # a subclass exposes it -- confirm.
        self.im_conf = _compute_img_conf(
            self.imshapes, self.device, self.edges, self.edge2conf_i, self.edge2conf_j
        )
        for i in range(len(self.im_conf)):
            self.im_conf[i].requires_grad = False

        self.init_conf_maps = [c.clone() for c in self.im_conf]

        self.base_scale = base_scale
        self.norm_pw_scale = True
        self.pw_break = pw_break
        self.POSE_DIM = 7
        self.pw_poses = nn.Parameter(
            rand_pose((self.n_edges, 1 + self.POSE_DIM))
        )  # pairwise poses
        self.pw_adaptors = nn.Parameter(
            torch.zeros((self.n_edges, 2))
        )  # slight xy/z adaptation
        self.pw_adaptors.requires_grad_(allow_pw_adaptors)
        self.has_im_poses = False
        self.rand_pose = rand_pose

    def get_known_poses(self):
        """Return ``(count, mask, poses)`` for poses that are frozen.

        A pose counts as known when its entry in ``self.im_poses`` does not
        require gradients. Returns ``(0, None, None)`` when the optimizer has
        no image poses.
        """
        if self.has_im_poses:
            known_poses_msk = torch.tensor(
                [not (p.requires_grad) for p in self.im_poses]
            )
            known_poses = self.get_im_poses()
            return known_poses_msk.sum(), known_poses_msk, known_poses
        else:
            return 0, None, None

    def get_pw_norm_scale_factor(self):
        """Return the factor that normalises the pairwise scales."""
        if self.norm_pw_scale:
            # normalize scales so that things cannot go south
            # we want that exp(scale) ~= self.base_scale
            return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
        else:
            return 1  # don't norm scale for known poses

    def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
        """Write rotation, translation and scale into ``poses[idx]`` in place.

        Args:
            poses: Pose container indexed by ``idx``.
            idx: Index of the pose to update.
            R: 3x3 rotation, a 4x4 matrix carrying rotation and translation
                (then ``T`` must be None), or None to keep the rotation.
            T: Optional translation.
            scale: Optional scale; its log is stored in the last slot.
            force: Update even when the pose does not require gradients.

        Returns:
            The (possibly updated) pose entry.
        """
        # all poses == cam-to-world
        pose = poses[idx]
        if not (pose.requires_grad or force):
            return pose

        # Guard against R=None before touching .shape; the rotation is
        # optional, as the check below shows.
        if R is not None and R.shape == (4, 4):
            assert T is None
            T = R[:3, 3]
            R = R[:3, :3]

        if R is not None:
            pose.data[0:4] = roma.rotmat_to_unitquat(R)
        if T is not None:
            pose.data[4:7] = signed_log1p(
                T / (scale or 1)
            )  # translation is function of scale

        if scale is not None:
            assert poses.shape[-1] in (8, 13)
            pose.data[-1] = np.log(float(scale))
        return pose

    def forward(self, ret_details=False, epoch=None):
        """Compute the alignment loss averaged over all edges.

        Args:
            ret_details: When True, also return an ``(n_imgs, n_imgs)``
                tensor holding the per-edge loss (-1 where there is no edge).
            epoch: Unused here; accepted because ``global_alignment_iter``
                calls the network as ``net(epoch=cur_iter)``.

        Returns:
            The loss, or ``(loss, details)`` when ``ret_details`` is True.
        """
        pw_poses = self.get_pw_poses()  # cam-to-world
        pw_adapt = self.get_adaptors()
        proj_pts3d = self.get_pts3d()
        # pre-compute pixel weights
        weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
        weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}

        loss = 0
        if ret_details:
            details = -torch.ones((self.n_imgs, self.n_imgs))

        for e, (i, j) in enumerate(self.edges):
            i_j = edge_str(i, j)
            # distance in image i and j
            aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
            aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
            li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
            lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
            loss = loss + li + lj

            if ret_details:
                details[i, j] = li + lj
        loss /= self.n_edges  # average over all pairs

        if ret_details:
            return loss, details
        return loss

    @torch.cuda.amp.autocast(enabled=False)
    def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
        """Optionally initialise the scene, then run the optimisation loop.

        Args:
            init: None (no initialisation), ``"msp"``/``"mst"`` (minimum
                spanning tree initialisation) or ``"known_poses"`` (not
                implemented).
            niter_PnP: Forwarded to the initialisation routine.
            **kw: Forwarded to ``global_alignment_loop``.

        Raises:
            NotImplementedError: If ``init == "known_poses"``.
            ValueError: For any other unrecognised ``init`` value.
        """
        if init is None:
            pass
        elif init == "msp" or init == "mst":
            init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
        elif init == "known_poses":
            # Initialisation from known poses is not supported here.
            raise NotImplementedError
        else:
            raise ValueError(f"bad value for {init=}")

        return global_alignment_loop(self, **kw)

    @property
    def str_edges(self):
        """Edge names, one string per ``(i, j)`` pair in ``self.edges``."""
        return [edge_str(i, j) for i, j in self.edges]

    @property
    def n_edges(self):
        """Number of edges in the graph."""
        return len(self.edges)
188
+
189
+
190
def global_alignment_loop(
    net,
    lr=0.01,
    niter=300,
    schedule="cosine",
    lr_min=1e-3,
    temporal_smoothing_weight=0,
    depth_map_save_dir=None,
):
    """Run the Adam-based global alignment optimisation on ``net``.

    Args:
        net: Optimizer module exposing ``parameters``, ``named_parameters``,
            ``verbose`` and, when depth maps are dumped, ``get_depthmaps``.
        lr: Base learning rate.
        niter: Number of iterations.
        schedule: Learning-rate schedule name, forwarded to
            ``global_alignment_iter``.
        lr_min: Final learning rate of the schedule.
        temporal_smoothing_weight: Forwarded to ``global_alignment_iter``.
        depth_map_save_dir: When set and ``net.verbose`` is true, depth maps
            are written there as PNG files every 500 iterations.

    Returns:
        The loss of the last iteration (``inf`` if ``niter`` is 0). When
        ``net`` has no trainable parameter, ``net`` itself is returned
        unchanged -- existing behaviour kept for callers.
    """
    params = [p for p in net.parameters() if p.requires_grad]
    if not params:
        return net

    verbose = net.verbose
    if verbose:
        print("Global alignement - optimizing for:")
        print([name for name, value in net.named_parameters() if value.requires_grad])

    lr_base = lr
    optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9))

    def run_iter(cur_iter):
        # Single optimisation step shared by the verbose and silent paths.
        return global_alignment_iter(
            net,
            cur_iter,
            niter,
            lr_base,
            lr_min,
            optimizer,
            schedule,
            temporal_smoothing_weight=temporal_smoothing_weight,
        )

    loss = float("inf")
    if not verbose:
        for n in range(niter):
            loss, _ = run_iter(n)
        return loss

    with tqdm.tqdm(total=niter) as bar:
        while bar.n < bar.total:
            if bar.n % 500 == 0 and depth_map_save_dir is not None:
                # exist_ok avoids the check-then-create race.
                os.makedirs(depth_map_save_dir, exist_ok=True)
                # visualize the depthmaps
                depth_maps = net.get_depthmaps()
                for i, depth_map in enumerate(depth_maps):
                    depth_map_save_path = os.path.join(
                        depth_map_save_dir, f"depthmaps_{i}_iter_{bar.n}.png"
                    )
                    plt.imsave(
                        depth_map_save_path,
                        depth_map.detach().cpu().numpy(),
                        cmap="jet",
                    )
                print(
                    f"Saved depthmaps at iteration {bar.n} to {depth_map_save_dir}"
                )
            loss, lr = run_iter(bar.n)
            bar.set_postfix_str(f"{lr=:g} loss={loss:g}")
            bar.update()
    return loss
257
+
258
+
259
def global_alignment_iter(
    net,
    cur_iter,
    niter,
    lr_base,
    lr_min,
    optimizer,
    schedule,
    temporal_smoothing_weight=0,
):
    """Perform one optimisation step and return ``(loss, lr)``.

    The learning rate is derived from ``schedule`` (``"cosine"``,
    ``"linear"`` or ``"cycle<N>"``) at progress ``cur_iter / niter`` and
    applied to ``optimizer`` before the step. ``temporal_smoothing_weight``
    is accepted but not used in this function.

    Raises:
        ValueError: If ``schedule`` is not a recognised schedule name.
    """

    def _release_cache():
        # Free cached CUDA memory only when the network asks for it.
        if net.empty_cache:
            torch.cuda.empty_cache()

    progress = cur_iter / niter
    if schedule == "cosine":
        lr = cosine_schedule(progress, lr_base, lr_min)
    elif schedule == "linear":
        lr = linear_schedule(progress, lr_base, lr_min)
    elif schedule.startswith("cycle"):
        # Optional cycle count follows the "cycle" prefix; default is 2.
        try:
            num_cycles = int(schedule[5:])
        except ValueError:
            num_cycles = 2
        lr = cycled_linear_schedule(progress, lr_base, lr_min, num_cycles=num_cycles)
    else:
        raise ValueError(f"bad lr {schedule=}")

    adjust_learning_rate_by_lr(optimizer, lr)
    optimizer.zero_grad()
    _release_cache()

    loss = net(epoch=cur_iter)
    _release_cache()

    loss.backward()
    _release_cache()

    optimizer.step()

    return float(loss), lr
extern/CUT3R/cloud_opt/commons.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utility functions for global alignment
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import torch.nn as nn
9
+ import numpy as np
10
+
11
+
12
def edge_str(i, j):
    """Return the string name ``"<i>_<j>"`` of the edge from ``i`` to ``j``."""
    return "{}_{}".format(i, j)
14
+
15
+
16
def i_j_ij(ij):
    """Return ``(edge_name, ij)`` for an edge given as an ``(i, j)`` pair."""
    name = edge_str(*ij)
    return name, ij
18
+
19
+
20
def edge_conf(conf_i, conf_j, edge):
    """Return the product of the mean confidences of ``edge`` as a float."""
    mean_i = conf_i[edge].mean()
    mean_j = conf_j[edge].mean()
    return float(mean_i * mean_j)
22
+
23
+
24
def compute_edge_scores(edges, conf_i, conf_j):
    """Map each ``(i, j)`` pair to its edge confidence.

    ``edges`` must yield ``(edge_key, (i, j))`` items; ``edge_key`` is the
    key used to look up both confidence mappings.
    """
    scores = {}
    for e, (i, j) in edges:
        scores[(i, j)] = edge_conf(conf_i, conf_j, e)
    return scores
26
+
27
+
28
def NoGradParamDict(x):
    """Wrap dict ``x`` in an ``nn.ParameterDict`` with gradients disabled."""
    assert isinstance(x, dict)
    frozen = nn.ParameterDict(x)
    return frozen.requires_grad_(False)
31
+
32
+
33
def get_imshapes(edges, pred_i, pred_j):
    """Collect the first two dimensions of each image's prediction.

    Args:
        edges: Sequence of ``(i, j)`` image-index pairs.
        pred_i, pred_j: Per-edge predictions exposing ``.shape``.

    Returns:
        A list indexed by image id holding each image's 2-tuple shape, or
        None for ids that appear in no edge.

    Raises:
        AssertionError: If an image is seen with two different shapes.
    """
    n_imgs = 1 + max(max(pair) for pair in edges)
    shapes = [None] * n_imgs
    for edge_idx, (i, j) in enumerate(edges):
        hw_i = tuple(pred_i[edge_idx].shape[0:2])
        hw_j = tuple(pred_j[edge_idx].shape[0:2])
        # Validate both endpoints against what was recorded so far before
        # overwriting either entry.
        for img, hw in ((i, hw_i), (j, hw_j)):
            if shapes[img]:
                assert shapes[img] == hw, f"incorrect shape for image {img}"
        shapes[i], shapes[j] = hw_i, hw_j
    return shapes
46
+
47
+
48
def get_conf_trf(mode):
    """Return the confidence transform selected by ``mode``.

    Supported modes: ``"log"`` (``x.log()``), ``"sqrt"`` (``x.sqrt()``),
    ``"m1"`` (``x - 1``) and ``"id"``/``"none"`` (identity).

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """
    if mode == "log":

        def conf_trf(x):
            return x.log()

        return conf_trf

    if mode == "sqrt":

        def conf_trf(x):
            return x.sqrt()

        return conf_trf

    if mode == "m1":

        def conf_trf(x):
            return x - 1

        return conf_trf

    if mode in ("id", "none"):

        def conf_trf(x):
            return x

        return conf_trf

    raise ValueError(f"bad mode for {mode=}")
72
+
73
+
74
def l2_dist(a, b, weight):
    """Return the weighted sum of squared differences over the last axis."""
    diff = a - b
    return diff.square().sum(dim=-1) * weight
76
+
77
+
78
def l1_dist(a, b, weight):
    """Return the weighted norm of ``a - b`` taken over the last axis."""
    diff = a - b
    return diff.norm(dim=-1) * weight
80
+
81
+
82
+ ALL_DISTS = dict(l1=l1_dist, l2=l2_dist)
83
+
84
+
85
def signed_log1p(x):
    """Return ``sign(x) * log1p(|x|)`` element-wise."""
    return torch.sign(x) * torch.log1p(torch.abs(x))
88
+
89
+
90
def signed_expm1(x):
    """Return ``sign(x) * expm1(|x|)`` element-wise."""
    return torch.sign(x) * torch.expm1(torch.abs(x))
93
+
94
+
95
def cosine_schedule(t, lr_start, lr_end):
    """Cosine interpolation from ``lr_start`` (t=0) to ``lr_end`` (t=1).

    Raises:
        AssertionError: If ``t`` is outside ``[0, 1]``.
    """
    assert 0 <= t <= 1
    span = lr_start - lr_end
    return lr_end + span * (1 + np.cos(t * np.pi)) / 2
98
+
99
+
100
def linear_schedule(t, lr_start, lr_end):
    """Linear interpolation from ``lr_start`` (t=0) to ``lr_end`` (t=1).

    Raises:
        AssertionError: If ``t`` is outside ``[0, 1]``.
    """
    assert 0 <= t <= 1
    delta = lr_end - lr_start
    return lr_start + delta * t
extern/CUT3R/cloud_opt/dust3r_opt/__init__.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # global alignment optimization wrapper function
6
+ # --------------------------------------------------------
7
+ from enum import Enum
8
+
9
+ from .optimizer import PointCloudOptimizer
10
+
11
+
12
class GlobalAlignerMode(Enum):
    """Names of the alignment modes accepted by ``global_aligner``.

    Each member's value is its own name as a string.
    """

    # the only mode ``global_aligner`` in this module actually builds
    PointCloudOptimizer = "PointCloudOptimizer"
    # declared but not handled by ``global_aligner`` here (NotImplementedError)
    ModularPointCloudOptimizer = "ModularPointCloudOptimizer"
    PairViewer = "PairViewer"
16
+
17
+
18
def global_aligner(
    dust3r_output, device, mode=GlobalAlignerMode.PointCloudOptimizer, **optim_kw
):
    """Build the global-alignment optimizer for a set of pairwise predictions.

    Args:
        dust3r_output: mapping providing the keys ``"view1"``, ``"view2"``,
            ``"pred1"`` and ``"pred2"``.
        device: passed to ``.to(device)`` on the constructed optimizer.
        mode: a ``GlobalAlignerMode``; only ``PointCloudOptimizer`` is supported.
        **optim_kw: forwarded to the optimizer constructor.

    Raises:
        KeyError: if one of the four keys is missing from ``dust3r_output``.
        NotImplementedError: for any other ``mode``.
    """
    # extract all inputs (before the mode check, so missing keys fail first)
    view1 = dust3r_output["view1"]
    view2 = dust3r_output["view2"]
    pred1 = dust3r_output["pred1"]
    pred2 = dust3r_output["pred2"]

    if mode != GlobalAlignerMode.PointCloudOptimizer:
        raise NotImplementedError(f"Unknown mode {mode}")

    # build the optimizer
    return PointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device)
extern/CUT3R/cloud_opt/dust3r_opt/base_opt.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Base class for the global alignment procedure
6
+ # --------------------------------------------------------
7
+ from copy import deepcopy
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import roma
13
+ from copy import deepcopy
14
+ import tqdm
15
+ import cv2
16
+ from PIL import Image
17
+ from dust3r.utils.geometry import inv, geotrf
18
+ from dust3r.utils.device import to_numpy
19
+ from dust3r.utils.image import rgb
20
+ from dust3r.viz import SceneViz, segment_sky, auto_cam_size
21
+
22
+ from cloud_opt.dust3r_opt.commons import (
23
+ edge_str,
24
+ ALL_DISTS,
25
+ NoGradParamDict,
26
+ get_imshapes,
27
+ signed_expm1,
28
+ signed_log1p,
29
+ cosine_schedule,
30
+ linear_schedule,
31
+ get_conf_trf,
32
+ )
33
+ import cloud_opt.dust3r_opt.init_im_poses as init_fun
34
+ from pathlib import Path
35
+ from scipy.spatial.transform import Rotation
36
+ from evo.core.trajectory import PosePath3D, PoseTrajectory3D
37
+
38
+
39
def adjust_learning_rate_by_lr(optimizer, lr):
    """Set the learning rate of every param group of ``optimizer`` in place.

    A group carrying an ``"lr_scale"`` entry gets ``lr * lr_scale``; every
    other group gets ``lr`` itself.
    """
    for group in optimizer.param_groups:
        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
45
+
46
+
47
def make_traj(args) -> PoseTrajectory3D:
    """Normalize ``args`` into a ``PoseTrajectory3D``.

    ``args`` is either an existing ``PoseTrajectory3D`` (a deep copy is
    returned) or a 2-element tuple/list ``(traj, tstamps)`` where columns
    ``[:3]`` of ``traj`` are positions and columns ``[3:]`` are quaternions
    passed as ``orientations_quat_wxyz``.
    """
    if not isinstance(args, (tuple, list)):
        assert isinstance(args, PoseTrajectory3D), type(args)
        return deepcopy(args)

    traj, tstamps = args
    return PoseTrajectory3D(
        positions_xyz=traj[:, :3],
        orientations_quat_wxyz=traj[:, 3:],
        timestamps=tstamps,
    )
57
+
58
+
59
def save_trajectory_tum_format(traj, filename):
    """Write a trajectory to ``filename``, one pose per line.

    Each line is ``timestamp x y z q0 q1 q2 q3`` where ``q0..q3`` are taken
    from ``orientations_quat_wxyz`` in stored order.
    NOTE(review): that is w-first; confirm downstream readers expect this
    rather than an x-y-z-w ordering.
    """
    traj = make_traj(traj)

    def tostr(values):
        return " ".join(map(str, values))

    with Path(filename).open("w") as f:
        for i in range(traj.num_poses):
            stamp = traj.timestamps[i]
            xyz = tostr(traj.positions_xyz[i])
            quat = tostr(traj.orientations_quat_wxyz[i][[0, 1, 2, 3]])
            f.write(f"{stamp} {xyz} {quat}\n")
    print(f"Saved trajectory to {filename}")
68
+
69
+
70
def c2w_to_tumpose(c2w):
    """Flatten a camera-to-world matrix into a 7-vector pose.

    input: c2w: 4x4 matrix
    output: numpy array (x y z qw qx qy qz)
    """
    pose = to_numpy(c2w)  # convert input to numpy
    # as_quat() is unpacked as (x, y, z, w); the result stores w first
    qx, qy, qz, qw = Rotation.from_matrix(pose[:3, :3]).as_quat()
    translation = pose[:3, -1]
    return np.concatenate([translation, [qw, qx, qy, qz]])
84
+
85
+
86
class BasePCOptimizer(nn.Module):
    """Optimize a global scene, given a list of pairwise observations.

    Graph node: images
    Graph edges: observations = (pred1, pred2)

    This base class stores the pairwise predictions, confidences and the
    per-edge pose parameters, and implements the alignment loss
    (``forward``). Image-level quantities (depth maps, focals, image poses,
    intrinsics) are left to subclasses: the corresponding methods raise
    ``NotImplementedError`` here, and ``get_intrinsics`` / ``get_init_conf``
    are not defined here at all although some helpers call them.
    """

    def __init__(self, *args, **kwargs):
        """Either copy another optimizer or build one from views/predictions.

        With exactly one positional argument and no keyword arguments, the
        argument is deep-copied and its attributes are transferred to this
        instance. Otherwise all arguments go to ``_init_from_views``.
        """
        if len(args) == 1 and len(kwargs) == 0:
            # nn.Module bookkeeping must exist before attributes are assigned
            super().__init__()
            other = deepcopy(args[0])
            attrs = """edges is_symmetrized dist n_imgs pred_i pred_j imshapes
                        min_conf_thr conf_thr conf_trf conf_i conf_j im_conf
                        base_scale norm_pw_scale pw_break POSE_DIM pw_poses
                        pw_adaptors has_im_poses rand_pose imgs verbose""".split()
            for name in attrs:
                try:
                    # the source is normally a module (attribute access);
                    # plain dicts are still accepted via item access
                    value = other[name] if isinstance(other, dict) else getattr(other, name)
                except (KeyError, AttributeError):
                    # e.g. "conf_thr" is never set by _init_from_views
                    continue
                # setattr (not __dict__.update) so parameters and submodules
                # are registered with nn.Module
                setattr(self, name, value)
        else:
            self._init_from_views(*args, **kwargs)

    def _init_from_views(
        self,
        view1,
        view2,
        pred1,
        pred2,
        dist="l1",
        conf="log",
        min_conf_thr=3,
        base_scale=0.5,
        allow_pw_adaptors=False,
        pw_break=20,
        rand_pose=torch.randn,
        iterationsCount=None,
        verbose=True,
    ):
        """Initialize the optimizer state from pairwise views and predictions.

        Args:
            view1, view2: mappings with an ``"idx"`` entry (image index of
                each edge endpoint) and optionally ``"img"``. A non-list
                ``"idx"`` is converted in place with ``.tolist()``.
            pred1: mapping with ``"pts3d_in_self_view"`` and ``"conf_self"``.
            pred2: mapping with ``"pts3d_in_other_view"`` and ``"conf"``.
            dist: key into ``ALL_DISTS`` selecting the distance function.
            conf: mode passed to ``get_conf_trf`` for the pixel weights.
            min_conf_thr: threshold used by ``get_masks``.
            base_scale: target scale used by ``get_pw_norm_scale_factor``.
            allow_pw_adaptors: whether ``pw_adaptors`` requires gradients.
            pw_break: divisor applied to the adaptors in ``get_adaptors``.
            rand_pose: callable ``shape -> tensor`` used to initialize
                ``pw_poses``.
            iterationsCount: accepted but not used in this method.
            verbose: stored as ``self.verbose``.
        """
        super().__init__()
        if not isinstance(view1["idx"], list):
            view1["idx"] = view1["idx"].tolist()
        if not isinstance(view2["idx"], list):
            view2["idx"] = view2["idx"].tolist()
        self.edges = [(int(i), int(j)) for i, j in zip(view1["idx"], view2["idx"])]
        self.is_symmetrized = set(self.edges) == {(j, i) for i, j in self.edges}
        self.dist = ALL_DISTS[dist]
        self.verbose = verbose

        self.n_imgs = self._check_edges()

        # input data
        pred1_pts = pred1["pts3d_in_self_view"]
        pred2_pts = pred2["pts3d_in_other_view"]
        self.pred_i = NoGradParamDict(
            {ij: pred1_pts[n] for n, ij in enumerate(self.str_edges)}
        )
        self.pred_j = NoGradParamDict(
            {ij: pred2_pts[n] for n, ij in enumerate(self.str_edges)}
        )
        self.imshapes = get_imshapes(self.edges, pred1_pts, pred2_pts)

        # work in log-scale with conf
        pred1_conf = pred1["conf_self"]
        pred2_conf = pred2["conf"]
        self.min_conf_thr = min_conf_thr
        self.conf_trf = get_conf_trf(conf)

        self.conf_i = NoGradParamDict(
            {ij: pred1_conf[n] for n, ij in enumerate(self.str_edges)}
        )
        self.conf_j = NoGradParamDict(
            {ij: pred2_conf[n] for n, ij in enumerate(self.str_edges)}
        )
        self.im_conf = self._compute_img_conf(pred1_conf, pred2_conf)
        for i in range(len(self.im_conf)):
            self.im_conf[i].requires_grad = False

        # pairwise pose parameters
        self.base_scale = base_scale
        self.norm_pw_scale = True
        self.pw_break = pw_break
        self.POSE_DIM = 7
        self.pw_poses = nn.Parameter(
            rand_pose((self.n_edges, 1 + self.POSE_DIM))
        )  # pairwise poses
        self.pw_adaptors = nn.Parameter(
            torch.zeros((self.n_edges, 2))
        )  # slight xy/z adaptation
        self.pw_adaptors.requires_grad_(allow_pw_adaptors)
        self.has_im_poses = False
        self.rand_pose = rand_pose

        # possibly store images for show_pointcloud
        self.imgs = None
        if "img" in view1 and "img" in view2:
            imgs = [torch.zeros((3,) + hw) for hw in self.imshapes]
            for v in range(len(self.edges)):
                idx = view1["idx"][v]
                imgs[idx] = view1["img"][v]
                idx = view2["idx"][v]
                imgs[idx] = view2["img"][v]
            self.imgs = rgb(imgs)

    @property
    def n_edges(self):
        """Number of pairwise observations."""
        return len(self.edges)

    @property
    def str_edges(self):
        """Edges as ``edge_str`` keys, in the same order as ``self.edges``."""
        return [edge_str(i, j) for i, j in self.edges]

    @property
    def imsizes(self):
        """Image sizes as ``(w, h)``, i.e. ``imshapes`` with swapped entries."""
        return [(w, h) for h, w in self.imshapes]

    @property
    def device(self):
        """Device of the first parameter of this module."""
        return next(iter(self.parameters())).device

    def state_dict(self, trainable=True):
        """Return a filtered state dict.

        Keys starting with ``_``, ``pred_i.``, ``pred_j.``, ``conf_i.`` or
        ``conf_j.`` are returned when ``trainable`` is false; all the other
        keys are returned when ``trainable`` is true.
        """
        all_params = super().state_dict()
        return {
            k: v
            for k, v in all_params.items()
            if k.startswith(("_", "pred_i.", "pred_j.", "conf_i.", "conf_j."))
            != trainable
        }

    def load_state_dict(self, data):
        """Load ``data`` on top of this module's own non-trainable entries."""
        return super().load_state_dict(self.state_dict(trainable=False) | data)

    def _check_edges(self):
        """Check that image indices are exactly ``0..n-1`` and return ``n``."""
        indices = sorted({i for edge in self.edges for i in edge})
        assert indices == list(range(len(indices))), "bad pair indices: missing values "
        return len(indices)

    @torch.no_grad()
    def _compute_img_conf(self, pred1_conf, pred2_conf):
        """Per-image confidence: element-wise max over all edges touching it."""
        im_conf = nn.ParameterList(
            [torch.zeros(hw, device=self.device) for hw in self.imshapes]
        )
        for e, (i, j) in enumerate(self.edges):
            im_conf[i] = torch.maximum(im_conf[i], pred1_conf[e])
            im_conf[j] = torch.maximum(im_conf[j], pred2_conf[e])
        return im_conf

    def get_adaptors(self):
        """Return the per-edge ``(scale_xy, scale_xy, scale_z)`` factors."""
        adapt = self.pw_adaptors
        adapt = torch.cat(
            (adapt[:, 0:1], adapt), dim=-1
        )  # (scale_xy, scale_xy, scale_z)
        if self.norm_pw_scale:  # normalize so that the product == 1
            adapt = adapt - adapt.mean(dim=1, keepdim=True)
        return (adapt / self.pw_break).exp()

    def _get_poses(self, poses):
        """Decode pose parameters into homogeneous matrices.

        Columns ``0:4`` are a quaternion (normalized here), columns ``4:7`` a
        translation stored through ``signed_log1p``.
        """
        # normalize rotation
        Q = poses[:, :4]
        T = signed_expm1(poses[:, 4:7])
        RT = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous()
        return RT

    def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
        """Write a pose into ``poses[idx]`` in place and return that row.

        Nothing is written unless the row requires grad or ``force`` is set.
        ``R`` may be a 4x4 matrix (then ``T`` must be ``None``), a rotation
        matrix, or ``None`` to leave the rotation untouched.
        """
        # all poses == cam-to-world
        pose = poses[idx]
        if not (pose.requires_grad or force):
            return pose

        # guard on None first: the rotation is optional, as checked below
        if R is not None and R.shape == (4, 4):
            assert T is None
            T = R[:3, 3]
            R = R[:3, :3]

        if R is not None:
            pose.data[0:4] = roma.rotmat_to_unitquat(R)
        if T is not None:
            pose.data[4:7] = signed_log1p(
                T / (scale or 1)
            )  # translation is function of scale

        if scale is not None:
            assert poses.shape[-1] in (8, 13)
            pose.data[-1] = np.log(float(scale))
        return pose

    def get_pw_norm_scale_factor(self):
        """Factor that brings the mean pairwise log-scale to ``log(base_scale)``."""
        if self.norm_pw_scale:
            # normalize scales so that things cannot go south
            # we want that exp(scale) ~= self.base_scale
            return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
        else:
            return 1  # don't norm scale for known poses

    def get_pw_scale(self):
        """Per-edge scale, after normalization."""
        scale = self.pw_poses[:, -1].exp()  # (n_edges,)
        scale = scale * self.get_pw_norm_scale_factor()
        return scale

    def get_pw_poses(self):  # cam to world
        """Per-edge poses with the edge scale folded into the top 3 rows."""
        RT = self._get_poses(self.pw_poses)
        scaled_RT = RT.clone()
        scaled_RT[:, :3] *= self.get_pw_scale().view(
            -1, 1, 1
        )  # scale the rotation AND translation
        return scaled_RT

    def get_masks(self):
        """Per-image boolean masks of pixels whose confidence exceeds the threshold."""
        return [(conf > self.min_conf_thr) for conf in self.im_conf]

    def depth_to_pts3d(self):
        raise NotImplementedError()

    def get_pts3d(self, raw=False):
        """Return the 3D points from ``depth_to_pts3d``.

        Unless ``raw`` is set, each entry is cropped to ``h * w`` points and
        reshaped to ``(h, w, 3)`` using ``imshapes``.
        """
        res = self.depth_to_pts3d()
        if not raw:
            res = [dm[: h * w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
        return res

    def _set_focal(self, idx, focal, force=False):
        raise NotImplementedError()

    def get_focals(self):
        raise NotImplementedError()

    def get_known_focal_mask(self):
        raise NotImplementedError()

    def get_principal_points(self):
        raise NotImplementedError()

    def get_conf(self, mode=None):
        """Per-image confidences, transformed by ``mode`` (default: own transform)."""
        trf = self.conf_trf if mode is None else get_conf_trf(mode)
        return [trf(c) for c in self.im_conf]

    def get_im_poses(self):
        raise NotImplementedError()

    def _set_depthmap(self, idx, depth, force=False):
        raise NotImplementedError()

    def get_depthmaps(self, raw=False):
        raise NotImplementedError()

    def clean_pointcloud(self, **kw):
        """Run the module-level ``clean_pointcloud`` and store the new confidences.

        NOTE(review): relies on ``self.get_intrinsics()``, which is not
        defined in this class; presumably provided by subclasses.
        """
        cams = inv(self.get_im_poses())
        K = self.get_intrinsics()
        depthmaps = self.get_depthmaps()
        all_pts3d = self.get_pts3d()

        new_im_confs = clean_pointcloud(
            self.im_conf, K, cams, depthmaps, all_pts3d, **kw
        )
        for i, new_conf in enumerate(new_im_confs):
            self.im_conf[i].data[:] = new_conf
        return self

    def get_tum_poses(self):
        """Return ``[poses, timestamps]``; timestamps are ``0..n-1`` as floats."""
        poses = self.get_im_poses()
        tt = np.arange(len(poses)).astype(float)
        tum_poses = [c2w_to_tumpose(p) for p in poses]
        tum_poses = np.stack(tum_poses, 0)
        return [tum_poses, tt]

    def save_tum_poses(self, path):
        """Write the image poses to ``path`` and return the pose array."""
        traj = self.get_tum_poses()
        save_trajectory_tum_format(traj, path)
        return traj[0]  # return the poses

    def save_focals(self, path):
        """Save the focals as text to ``path`` and return them."""
        # convert focal to txt
        focals = self.get_focals()
        np.savetxt(path, focals.detach().cpu().numpy(), fmt="%.6f")
        return focals

    def save_intrinsics(self, path):
        """Save the intrinsics, 9 values per row, and return them unreshaped.

        NOTE(review): relies on ``self.get_intrinsics()``, which is not
        defined in this class.
        """
        K_raw = self.get_intrinsics()
        K = K_raw.reshape(-1, 9)
        np.savetxt(path, K.detach().cpu().numpy(), fmt="%.6f")
        return K_raw

    def save_conf_maps(self, path):
        """Save every confidence map as ``conf_{i}.npy`` under ``path``."""
        conf = self.get_conf()
        for i, c in enumerate(conf):
            np.save(f"{path}/conf_{i}.npy", c.detach().cpu().numpy())
        return conf

    def save_init_conf_maps(self, path):
        """Save every initial confidence map as ``init_conf_{i}.npy``.

        NOTE(review): relies on ``self.get_init_conf()``, which is not
        defined in this class.
        """
        conf = self.get_init_conf()
        for i, c in enumerate(conf):
            np.save(f"{path}/init_conf_{i}.npy", c.detach().cpu().numpy())
        return conf

    def save_rgb_imgs(self, path):
        """Write ``self.imgs`` as ``frame_XXXX.png`` files under ``path``."""
        imgs = self.imgs
        for i, img in enumerate(imgs):
            # convert from rgb to bgr
            img = img[..., ::-1]
            cv2.imwrite(f"{path}/frame_{i:04d}.png", img * 255)
        return imgs

    def save_dynamic_masks(self, path):
        """Write the dynamic masks as ``dynamic_mask_{i}.png`` under ``path``.

        ``self.sam2_dynamic_masks`` is used when present and not ``None``,
        otherwise ``self.dynamic_masks``; neither is set by this class.
        """
        dynamic_masks = (
            self.dynamic_masks
            if getattr(self, "sam2_dynamic_masks", None) is None
            else self.sam2_dynamic_masks
        )
        for i, dynamic_mask in enumerate(dynamic_masks):
            cv2.imwrite(
                f"{path}/dynamic_mask_{i}.png",
                (dynamic_mask * 255).detach().cpu().numpy().astype(np.uint8),
            )
        return dynamic_masks

    def save_depth_maps(self, path):
        """Save each depth map as a colored PNG and a ``.npy`` file, plus a GIF.

        The GIF (``_depth_maps.gif``) is only written when there is at least
        one depth map. Returns the depth maps.
        """
        depth_maps = self.get_depthmaps()
        images = []

        for i, depth_map in enumerate(depth_maps):
            # Apply color map to depth map
            depth_map_colored = cv2.applyColorMap(
                (depth_map * 255).detach().cpu().numpy().astype(np.uint8),
                cv2.COLORMAP_JET,
            )
            img_path = f"{path}/frame_{(i):04d}.png"
            cv2.imwrite(img_path, depth_map_colored)
            images.append(Image.open(img_path))
            np.save(f"{path}/frame_{(i):04d}.npy", depth_map.detach().cpu().numpy())

        if images:  # nothing to animate without frames
            images[0].save(
                f"{path}/_depth_maps.gif",
                save_all=True,
                append_images=images[1:],
                duration=100,
                loop=0,
            )

        return depth_maps

    def forward(self, ret_details=False):
        """Compute the alignment loss averaged over all edges.

        For every edge, both pairwise predictions are moved with that edge's
        pose and adaptors and compared to the current points of images ``i``
        and ``j`` using ``self.dist`` weighted by the transformed confidence.
        With ``ret_details`` a ``(n_imgs, n_imgs)`` matrix of per-edge losses
        (``-1`` where there is no edge) is returned as well.
        """
        pw_poses = self.get_pw_poses()  # cam-to-world
        pw_adapt = self.get_adaptors()
        proj_pts3d = self.get_pts3d()
        # pre-compute pixel weights
        weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
        weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}

        loss = 0
        if ret_details:
            details = -torch.ones((self.n_imgs, self.n_imgs))

        for e, (i, j) in enumerate(self.edges):
            i_j = edge_str(i, j)
            # distance in image i and j
            aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
            aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
            li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
            lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
            loss = loss + li + lj

            if ret_details:
                details[i, j] = li + lj
        loss /= self.n_edges  # average over all pairs

        if ret_details:
            return loss, details
        return loss

    @torch.cuda.amp.autocast(enabled=False)
    def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
        """Optionally initialize the scene, then run ``global_alignment_loop``.

        ``init`` is ``None`` (no initialization), ``"msp"``/``"mst"``
        (minimum spanning tree) or ``"known_poses"``; anything else raises
        ``ValueError``. Remaining keyword arguments go to the loop.
        """
        if init is None:
            pass
        elif init == "msp" or init == "mst":
            init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
        elif init == "known_poses":
            init_fun.init_from_known_poses(
                self, min_conf_thr=self.min_conf_thr, niter_PnP=niter_PnP
            )
        else:
            raise ValueError(f"bad value for {init=}")
        return global_alignment_loop(self, **kw)

    @torch.no_grad()
    def mask_sky(self):
        """Return a deep copy whose confidence is zeroed on segmented sky pixels."""
        res = deepcopy(self)
        for i in range(self.n_imgs):
            sky = segment_sky(self.imgs[i])
            res.im_conf[i][sky] = 0
        return res

    def show(self, show_pw_cams=False, show_pw_pts3d=False, cam_size=None, **kw):
        """Display point clouds and cameras in a ``SceneViz`` and return it.

        Args:
            show_pw_cams: also draw the pairwise cameras.
            show_pw_pts3d: also draw the pairwise point clouds.
            cam_size: camera size; ``auto_cam_size`` is used when ``None``.
            **kw: forwarded to ``viz.show``.
        """
        viz = SceneViz()
        if self.imgs is None:
            colors = np.random.randint(0, 256, size=(self.n_imgs, 3))
            colors = list(map(tuple, colors.tolist()))
            for n in range(self.n_imgs):
                viz.add_pointcloud(self.get_pts3d()[n], colors[n], self.get_masks()[n])
        else:
            viz.add_pointcloud(self.get_pts3d(), self.imgs, self.get_masks())
            colors = np.random.randint(256, size=(self.n_imgs, 3))

        # camera poses
        im_poses = to_numpy(self.get_im_poses())
        if cam_size is None:
            cam_size = auto_cam_size(im_poses)
        viz.add_cameras(
            im_poses,
            self.get_focals(),
            colors=colors,
            images=self.imgs,
            imsizes=self.imsizes,
            cam_size=cam_size,
        )

        # both optional overlays need the pairwise poses, so fetch them once
        # whenever either is requested
        if show_pw_cams or show_pw_pts3d:
            pw_poses = self.get_pw_poses()

        if show_pw_cams:
            viz.add_cameras(pw_poses, color=(192, 0, 192), cam_size=cam_size)

        if show_pw_pts3d:
            pts = [
                geotrf(pw_poses[e], self.pred_i[edge_str(i, j)])
                for e, (i, j) in enumerate(self.edges)
            ]
            viz.add_pointcloud(pts, (128, 0, 128))

        viz.show(**kw)
        return viz
531
+
532
+
533
def global_alignment_loop(net, lr=0.01, niter=300, schedule="cosine", lr_min=1e-6):
    """Optimize the trainable parameters of ``net`` with Adam for ``niter`` steps.

    Args:
        net: module whose call returns the loss; must expose ``verbose``.
        lr: base learning rate handed to the schedule.
        niter: number of iterations.
        schedule: schedule name forwarded to ``global_alignment_iter``.
        lr_min: final learning rate of the schedule.

    Returns:
        ``net`` itself when it has no trainable parameter; otherwise the last
        loss as a float (``inf`` if no iteration ran).
    """
    trainable = [p for p in net.parameters() if p.requires_grad]
    if len(trainable) == 0:
        return net

    verbose = net.verbose
    if verbose:
        print("Global alignement - optimizing for:")
        print([name for name, value in net.named_parameters() if value.requires_grad])

    lr_base = lr
    optimizer = torch.optim.Adam(trainable, lr=lr, betas=(0.9, 0.9))

    loss = float("inf")
    if not verbose:
        for step in range(niter):
            loss, _ = global_alignment_iter(
                net, step, niter, lr_base, lr_min, optimizer, schedule
            )
        return loss

    # verbose: the progress bar's own counter drives the loop
    with tqdm.tqdm(total=niter) as bar:
        while bar.n < bar.total:
            loss, lr = global_alignment_iter(
                net, bar.n, niter, lr_base, lr_min, optimizer, schedule
            )
            bar.set_postfix_str(f"{lr=:g} loss={loss:g}")
            bar.update()
    return loss
561
+
562
+
563
def global_alignment_iter(net, cur_iter, niter, lr_base, lr_min, optimizer, schedule):
    """Run one optimization step and return ``(loss as float, learning rate)``.

    The learning rate for progress ``cur_iter / niter`` comes from the
    ``"cosine"`` or ``"linear"`` schedule; any other name raises
    ``ValueError``.
    """
    progress = cur_iter / niter
    if schedule == "linear":
        lr = linear_schedule(progress, lr_base, lr_min)
    elif schedule == "cosine":
        lr = cosine_schedule(progress, lr_base, lr_min)
    else:
        raise ValueError(f"bad lr {schedule=}")

    adjust_learning_rate_by_lr(optimizer, lr)
    optimizer.zero_grad()
    loss = net()
    loss.backward()
    optimizer.step()
    return float(loss), lr
578
+
579
+
580
@torch.no_grad()
def clean_pointcloud(
    im_confs, K, cams, depthmaps, all_pts3d, tol=0.001, bad_conf=0, dbg=()
):
    """Lower the confidence of points that contradict another view's depth map.

    Method:
      1) express all 3d points in each camera coordinate frame
      2) if they're in front of a depthmap --> then lower their confidence

    All five sequences must have the same length and ``0 <= tol < 1``. The
    inputs are not modified; a list of new confidence maps is returned.
    ``dbg`` is accepted but unused.
    """
    assert len(im_confs) == len(cams) == len(K) == len(depthmaps) == len(all_pts3d)
    assert 0 <= tol < 1
    cleaned = [conf.clone() for conf in im_confs]

    # reshape appropriately
    all_pts3d = [pts.view(*conf.shape, 3) for pts, conf in zip(all_pts3d, im_confs)]
    depthmaps = [depth.view(*conf.shape) for depth, conf in zip(depthmaps, im_confs)]

    n_views = len(all_pts3d)
    for i, pts3d in enumerate(all_pts3d):
        for j in (other for other in range(n_views) if other != i):
            # project 3dpts in other view
            proj = geotrf(cams[j], pts3d)
            proj_depth = proj[:, :, 2]
            u, v = geotrf(K[j], proj, norm=1, ncol=2).round().long().unbind(-1)

            # check which points are actually in the visible cone
            H, W = im_confs[j].shape
            msk_i = (proj_depth > 0) & (0 <= u) & (u < W) & (0 <= v) & (v < H)
            msk_j = v[msk_i], u[msk_i]

            # find bad points = those in front but less confident
            in_front = proj_depth[msk_i] < (1 - tol) * depthmaps[j][msk_j]
            less_confident = cleaned[i][msk_i] < cleaned[j][msk_j]
            bad_points = in_front & less_confident

            bad_msk_i = msk_i.clone()
            bad_msk_i[msk_i] = bad_points
            cleaned[i][bad_msk_i] = cleaned[i][bad_msk_i].clip_(max=bad_conf)

    return cleaned
extern/CUT3R/cloud_opt/dust3r_opt/commons.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utility functions for global alignment
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import torch.nn as nn
9
+ import numpy as np
10
+
11
+
12
def edge_str(i, j):
    """Return the string key ``"<i>_<j>"`` identifying the edge ``(i, j)``."""
    return "{}_{}".format(i, j)
14
+
15
+
16
def i_j_ij(ij):
    """Pair an edge ``ij`` with its string key: returns ``(edge_str(*ij), ij)``."""
    key = edge_str(*ij)
    return key, ij
18
+
19
+
20
def edge_conf(conf_i, conf_j, edge):
    """Score an edge as the product of its two mean confidences, as a float."""
    mean_i = conf_i[edge].mean()
    mean_j = conf_j[edge].mean()
    return float(mean_i * mean_j)
22
+
23
+
24
def compute_edge_scores(edges, conf_i, conf_j):
    """Build a ``{(i, j): score}`` table for a collection of edges.

    ``edges`` must yield ``(key, (i, j))`` pairs; ``key`` is forwarded to
    ``edge_conf`` to look up the confidences of that edge in ``conf_i`` and
    ``conf_j``.
    """
    scores = {}
    for key, (i, j) in edges:
        scores[(i, j)] = edge_conf(conf_i, conf_j, key)
    return scores
26
+
27
+
28
def NoGradParamDict(x):
    """Wrap the dict ``x`` in an ``nn.ParameterDict`` with gradients disabled.

    Raises ``AssertionError`` when ``x`` is not a ``dict``.
    """
    assert isinstance(x, dict)
    frozen = nn.ParameterDict(x)
    return frozen.requires_grad_(False)
31
+
32
+
33
def get_imshapes(edges, pred_i, pred_j):
    """Collect the leading two dimensions of every image from its edges.

    For edge number ``e`` linking images ``(i, j)``, the first two dims of
    ``pred_i[e]`` are recorded for image ``i`` and those of ``pred_j[e]`` for
    image ``j``. A shape recorded earlier must agree with the new one,
    otherwise an ``AssertionError`` is raised.

    Returns a list indexed by image id (length ``max index + 1``); images
    that appear in no edge keep ``None``.
    """
    imshapes = [None] * (1 + max(max(pair) for pair in edges))
    for e, (i, j) in enumerate(edges):
        seen = (
            (i, tuple(pred_i[e].shape[0:2])),
            (j, tuple(pred_j[e].shape[0:2])),
        )
        # validate both endpoints first, then record both
        for img, hw in seen:
            known = imshapes[img]
            if known:
                assert known == hw, f"incorrect shape for image {img}"
        for img, hw in seen:
            imshapes[img] = hw
    return imshapes
46
+
47
+
48
def get_conf_trf(mode):
    """Return the confidence transform selected by ``mode``.

    Supported modes: ``"log"`` -> ``x.log()``, ``"sqrt"`` -> ``x.sqrt()``,
    ``"m1"`` -> ``x - 1``, ``"id"`` / ``"none"`` -> ``x`` unchanged.
    Any other value raises ``ValueError``.
    """

    def identity(x):
        return x

    transforms = (
        ("log", lambda x: x.log()),
        ("sqrt", lambda x: x.sqrt()),
        ("m1", lambda x: x - 1),
        ("id", identity),
        ("none", identity),
    )
    for name, conf_trf in transforms:
        if mode == name:
            return conf_trf
    raise ValueError(f"bad mode for {mode=}")
72
+
73
+
74
def l2_dist(a, b, weight):
    """Weighted squared distance: sum of squared differences over the last dim."""
    diff = a - b
    squared = diff.square().sum(dim=-1)
    return squared * weight
76
+
77
+
78
def l1_dist(a, b, weight):
    """Weighted, non-squared distance: ``norm`` of the difference over the last dim.

    NOTE(review): despite the name, this calls ``.norm(dim=-1)`` with its
    default order, not a sum of absolute differences.
    """
    diff = a - b
    length = diff.norm(dim=-1)
    return length * weight
80
+
81
+
82
+ ALL_DISTS = dict(l1=l1_dist, l2=l2_dist)
83
+
84
+
85
def signed_log1p(x):
    """Apply ``log1p`` to ``|x|`` and restore the sign of ``x``."""
    magnitude = torch.log1p(torch.abs(x))
    return torch.sign(x) * magnitude
88
+
89
+
90
def signed_expm1(x):
    """Apply ``expm1`` to ``|x|`` and restore the sign of ``x``."""
    magnitude = torch.expm1(torch.abs(x))
    return torch.sign(x) * magnitude
93
+
94
+
95
def cosine_schedule(t, lr_start, lr_end):
    """Half-cosine interpolation from ``lr_start`` (t=0) to ``lr_end`` (t=1).

    ``t`` must lie in ``[0, 1]``; otherwise an ``AssertionError`` is raised.
    """
    assert 0 <= t <= 1
    span = lr_start - lr_end
    blend = 1 + np.cos(t * np.pi)
    return lr_end + span * blend / 2
98
+
99
+
100
def linear_schedule(t, lr_start, lr_end):
    """Linear interpolation from ``lr_start`` (t=0) to ``lr_end`` (t=1).

    ``t`` must lie in ``[0, 1]``; otherwise an ``AssertionError`` is raised.
    """
    assert 0 <= t <= 1
    delta = lr_end - lr_start
    return lr_start + delta * t
extern/CUT3R/cloud_opt/dust3r_opt/init_im_poses.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Initialization functions for global alignment
6
+ # --------------------------------------------------------
7
+ from functools import cache
8
+
9
+ import numpy as np
10
+ import scipy.sparse as sp
11
+ import torch
12
+ import cv2
13
+ import roma
14
+ from tqdm import tqdm
15
+
16
+ from dust3r.utils.geometry import geotrf, inv, get_med_dist_between_poses
17
+ from dust3r.post_process import estimate_focal_knowing_depth
18
+ from dust3r.viz import to_numpy
19
+
20
+ from cloud_opt.commons import edge_str, i_j_ij, compute_edge_scores
21
+
22
+
23
@torch.no_grad()
def init_from_known_poses(self, niter_PnP=10, min_conf_thr=3):
    """Initialize pairwise poses and depth maps when every pose and focal is known.

    For each edge, a relative pose is estimated with ``fast_pnp``, aligned to
    the two known poses, and written into ``self.pw_poses``. Each image's
    depth map is then set from the z-channel of its highest-mean-confidence
    edge prediction, multiplied by that edge's scale.

    Raises ``AssertionError`` unless all poses and all focals are known.
    """
    device = self.device

    # indices of known poses
    nkp, known_poses_msk, known_poses = get_known_poses(self)
    assert nkp == self.n_imgs, "not all poses are known"

    # get all focals
    nkf, _, im_focals = get_known_focals(self)
    assert nkf == self.n_imgs
    im_pp = self.get_principal_points()

    # image index -> (score, edge key, scale) of its best edge so far
    best_depthmaps = {}

    # init all pairwise poses
    for e, (i, j) in enumerate(tqdm(self.edges, disable=not self.verbose)):
        i_j = edge_str(i, j)
        edge_conf_i = self.conf_i[i_j]

        # find relative pose for this pair
        first_cam = torch.eye(4, device=device)
        msk = edge_conf_i > min(min_conf_thr, edge_conf_i.min() - 0.1)
        _, second_cam = fast_pnp(
            self.pred_j[i_j],
            float(im_focals[i].mean()),
            pp=im_pp[i],
            msk=msk,
            device=device,
            niter_PnP=niter_PnP,
        )

        # align the two predicted camera with the two gt cameras
        s, R, T = align_multiple_poses(
            torch.stack((first_cam, second_cam)), known_poses[[i, j]]
        )
        # normally we have known_poses[i] ~= sRT_to_4x4(s,R,T,device) @ P1
        # and geotrf(sRT_to_4x4(1,R,T,device), s*P2[:3,3])
        self._set_pose(self.pw_poses, e, R, T, scale=s)

        # remember if this is a good depthmap
        score = float(edge_conf_i.mean())
        previous_best = best_depthmaps.get(i, (0,))[0]
        if score > previous_best:
            best_depthmaps[i] = score, i_j, s

    # init all image poses
    for n in range(self.n_imgs):
        assert known_poses_msk[n]
        _, i_j, scale = best_depthmaps[n]
        depth = self.pred_i[i_j][:, :, 2]
        self._set_depthmap(n, depth * scale)
70
+
71
+
72
@torch.no_grad()
def init_minimum_spanning_tree(self, **kw):
    """Init all camera poses (image-wise and pairwise poses) given
    an initial set of pairwise estimations.

    Runs ``minimum_spanning_tree`` on the optimizer's edges, predictions and
    confidences, then hands the result to ``init_from_pts3d``. Extra keyword
    arguments are forwarded to ``minimum_spanning_tree``.
    """
    tree_inputs = (
        self.imshapes,
        self.edges,
        self.pred_i,
        self.pred_j,
        self.conf_i,
        self.conf_j,
        self.im_conf,
        self.min_conf_thr,
        self.device,
    )
    # the second returned value is not needed here
    pts3d, _, im_focals, im_poses = minimum_spanning_tree(
        *tree_inputs,
        has_im_poses=self.has_im_poses,
        verbose=self.verbose,
        **kw,
    )
    return init_from_pts3d(self, pts3d, im_focals, im_poses)
94
+
95
+
96
def init_from_pts3d(self, pts3d, im_focals, im_poses):
    """Initialize the optimizer's poses (and depth/focals) from world points.

    Args:
        self: the optimizer being initialized.
        pts3d: per-image 3D points; modified in place (aligned and rescaled).
        im_focals: per-image focal, or ``None`` where unknown.
        im_poses: per-image cam-to-world poses; rescaled in place.

    Raises:
        NotImplementedError: when exactly one pose is known.
    """
    # init poses
    nkp, known_poses_msk, known_poses = get_known_poses(self)
    if nkp == 1:
        raise NotImplementedError(
            "Would be simpler to just align everything afterwards on the single known pose"
        )
    elif nkp > 1:
        # global rigid SE3 alignment
        s, R, T = align_multiple_poses(
            im_poses[known_poses_msk], known_poses[known_poses_msk]
        )
        trf = sRT_to_4x4(s, R, T, device=known_poses.device)

        # rotate everything
        im_poses = trf @ im_poses
        im_poses[:, :3, :3] /= s  # undo scaling on the rotation part
        for img_pts3d in pts3d:
            img_pts3d[:] = geotrf(trf, img_pts3d)

    # set all pairwise poses
    for e, (i, j) in enumerate(self.edges):
        i_j = edge_str(i, j)
        # compute transform that goes from cam to world
        s, R, T = rigid_points_registration(
            self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j]
        )
        self._set_pose(self.pw_poses, e, R, T, scale=s)

    # take into account the scale normalization
    s_factor = self.get_pw_norm_scale_factor()
    im_poses[:, :3, 3] *= s_factor  # apply downscaling factor
    for img_pts3d in pts3d:
        img_pts3d *= s_factor

    # init all image poses
    if self.has_im_poses:
        for i in range(self.n_imgs):
            cam2world = im_poses[i]
            depth = geotrf(inv(cam2world), pts3d[i])[..., 2]
            self._set_depthmap(i, depth)
            self._set_pose(self.im_poses, i, cam2world)
            if im_focals[i] is not None:
                self._set_focal(i, im_focals[i])
144
+
145
+
146
def minimum_spanning_tree(
    imshapes,
    edges,
    pred_i,
    pred_j,
    conf_i,
    conf_j,
    im_conf,
    min_conf_thr,
    device,
    has_im_poses=True,
    niter_PnP=10,
    verbose=True,
):
    """Initialise a global point cloud by walking a maximum-confidence spanning tree.

    Edge scores are negated so that scipy's *minimum* spanning tree selects the
    most confident pairs. Starting from the strongest edge, every other tree
    edge is registered onto the already-placed image it touches.

    Args:
        imshapes: per-image shapes; only its length (number of images) is used.
        edges: iterable of (i, j) image index pairs.
        pred_i, pred_j: mappings "i_j" -> pointmap of image i / image j for that pair.
        conf_i, conf_j: mappings "i_j" -> confidence map for pred_i / pred_j.
        im_conf: per-image confidence maps, thresholded to select PnP points.
        min_conf_thr: confidence threshold applied to ``im_conf``.
        device: device on which poses are created.
        has_im_poses: when False, no poses/focals are estimated (both returned as None).
        niter_PnP: RANSAC iteration count forwarded to ``fast_pnp``.
        verbose: print the order in which edges are consumed.

    Returns:
        (pts3d, msp_edges, im_focals, im_poses) where ``im_poses`` is a stacked
        tensor and ``im_focals`` a list, or both None when ``has_im_poses`` is False.

    Raises:
        ValueError: if the pairwise graph is not connected, so some tree edges
            can never be attached to the initial pair.
    """
    n_imgs = len(imshapes)
    sparse_graph = -dict_to_sparse_graph(
        compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j)
    )
    msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()

    # temp variable to store 3d points
    pts3d = [None] * n_imgs

    todo = sorted(zip(-msp.data, msp.row, msp.col))  # sorted edges
    im_poses = [None] * n_imgs
    im_focals = [None] * n_imgs

    # init with strongest edge
    score, i, j = todo.pop()
    if verbose:
        print(f" init edge ({i}*,{j}*) {score=}")
    i_j = edge_str(i, j)
    pts3d[i] = pred_i[i_j].clone()
    pts3d[j] = pred_j[i_j].clone()
    done = {i, j}
    if has_im_poses:
        im_poses[i] = torch.eye(4, device=device)
        im_focals[i] = estimate_focal(pred_i[i_j])

    # set initial pointcloud based on pairwise graph
    msp_edges = [(i, j)]
    n_deferred = 0  # consecutive edges postponed without attaching anything
    while todo:
        # each time, predict the next one
        score, i, j = todo.pop()
        # The key must be refreshed *before* it is used: previously the focal
        # below was estimated from the previous iteration's edge.
        i_j = edge_str(i, j)

        if im_focals[i] is None:
            im_focals[i] = estimate_focal(pred_i[i_j])

        if i in done:
            if verbose:
                print(f" init edge ({i},{j}*) {score=}")
            assert j not in done
            # align pred[i] with pts3d[i], and then set j accordingly
            s, R, T = rigid_points_registration(pred_i[i_j], pts3d[i], conf=conf_i[i_j])
            trf = sRT_to_4x4(s, R, T, device)
            pts3d[j] = geotrf(trf, pred_j[i_j])
            done.add(j)
            msp_edges.append((i, j))

            if has_im_poses and im_poses[i] is None:
                im_poses[i] = sRT_to_4x4(1, R, T, device)

        elif j in done:
            if verbose:
                print(f" init edge ({i}*,{j}) {score=}")
            assert i not in done
            s, R, T = rigid_points_registration(pred_j[i_j], pts3d[j], conf=conf_j[i_j])
            trf = sRT_to_4x4(s, R, T, device)
            pts3d[i] = geotrf(trf, pred_i[i_j])
            done.add(i)
            msp_edges.append((i, j))

            if has_im_poses and im_poses[i] is None:
                im_poses[i] = sRT_to_4x4(1, R, T, device)
        else:
            # let's try again later
            todo.insert(0, (score, i, j))
            n_deferred += 1
            if n_deferred >= len(todo):
                # every remaining edge was postponed once and nothing was
                # attached in between: they will never become reachable
                raise ValueError(
                    "pairwise graph is not connected: cannot attach remaining "
                    f"edges {[(a, b) for _, a, b in todo]}"
                )
            continue
        n_deferred = 0

    if has_im_poses:
        # complete all missing informations
        pair_scores = list(
            sparse_graph.values()
        )  # already negative scores: less is best
        edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[
            np.argsort(pair_scores)
        ]
        for i, j in edges_from_best_to_worse.tolist():
            if im_focals[i] is None:
                im_focals[i] = estimate_focal(pred_i[edge_str(i, j)])

        for i in range(n_imgs):
            if im_poses[i] is None:
                msk = im_conf[i] > min_conf_thr
                res = fast_pnp(
                    pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP
                )
                if res:
                    im_focals[i], im_poses[i] = res
            if im_poses[i] is None:
                # PnP failed (or was skipped): fall back to the identity pose
                im_poses[i] = torch.eye(4, device=device)
        im_poses = torch.stack(im_poses)
    else:
        im_poses = im_focals = None

    return pts3d, msp_edges, im_focals, im_poses
254
+
255
+
256
def dict_to_sparse_graph(dic):
    """Turn a ``{(i, j): value}`` mapping into a square scipy DOK sparse array.

    The side length is one more than the largest index found in any key.
    """
    size = 1 + max(max(edge) for edge in dic)
    graph = sp.dok_array((size, size))
    for (row, col), weight in dic.items():
        graph[row, col] = weight
    return graph
262
+
263
+
264
def rigid_points_registration(pts1, pts2, conf):
    """Weighted similarity registration of ``pts1`` onto ``pts2`` via roma.

    Both point sets are flattened to (N, 3) and ``conf`` is flattened to serve
    as per-point weights.

    Returns:
        (s, R, T): scale first, then the rotation and translation exactly as
        roma returns them (the scale is not folded into R or T here).
    """
    src = pts1.reshape(-1, 3)
    dst = pts2.reshape(-1, 3)
    rotation, translation, scale = roma.rigid_points_registration(
        src, dst, weights=conf.ravel(), compute_scaling=True
    )
    return scale, rotation, translation  # return un-scaled (R, T)
272
+
273
+
274
def sRT_to_4x4(scale, R, T, device):
    """Assemble a 4x4 homogeneous matrix from scale, rotation and translation.

    The upper-left 3x3 block is ``R * scale``; the translation column is ``T``
    flattened and left unscaled; the last row stays ``[0, 0, 0, 1]``.
    """
    mat = torch.eye(4, device=device)
    mat[:3, 3] = T.ravel()  # doesn't need scaling
    mat[:3, :3] = R * scale
    return mat
279
+
280
+
281
def estimate_focal(pts3d_i, pp=None):
    """Estimate one focal length (as a Python float) from an (H, W, 3) pointmap.

    When ``pp`` is omitted the principal point defaults to the image centre
    ``(W / 2, H / 2)``. The estimate is delegated to
    ``estimate_focal_knowing_depth`` in "weiszfeld" mode on a batch of one.
    """
    if pp is None:
        height, width, channels = pts3d_i.shape
        assert channels == 3
        pp = torch.tensor((width / 2, height / 2), device=pts3d_i.device)
    batched_pts = pts3d_i.unsqueeze(0)
    batched_pp = pp.unsqueeze(0)
    focal = estimate_focal_knowing_depth(
        batched_pts, batched_pp, focal_mode="weiszfeld"
    )
    return float(focal.ravel())
290
+
291
+
292
@cache
def pixel_grid(H, W):
    """Return a cached (H, W, 2) float32 array whose entry [y, x] is (x, y)."""
    # np.indices((W, H)) has shape (2, W, H); transposing reverses the axes.
    return np.indices((W, H)).T.astype(np.float32)
295
+
296
+
297
def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
    """Recover a camera pose (and optionally a focal) with RANSAC-PnP.

    Args:
        pts3d: (H, W, 3) pointmap expressed in the world frame.
        focal: known focal length, or None to try 21 geometrically spaced
            candidates between ``max(W, H) / 2`` and ``max(W, H) * 3``.
        msk: (H, W) boolean mask of the pixels to use.
        device: device of the returned pose.
        pp: principal point; defaults to the image centre ``(W / 2, H / 2)``.
        niter_PnP: RANSAC iteration count.

    Returns:
        ``(best_focal, cam2world)`` for the candidate with the most inliers, or
        None when fewer than 4 points are selected or no candidate succeeds.
    """
    # extract camera poses and focals with RANSAC-PnP
    if msk.sum() < 4:
        return None  # we need at least 4 points for PnP
    pts3d, msk = map(to_numpy, (pts3d, msk))

    H, W, THREE = pts3d.shape
    assert THREE == 3
    pixels = pixel_grid(H, W)

    if focal is None:
        S = max(W, H)
        tentative_focals = np.geomspace(S / 2, S * 3, 21)
    else:
        tentative_focals = [focal]

    if pp is None:
        pp = (W / 2, H / 2)
    else:
        pp = to_numpy(pp)

    best = (0,)
    for candidate in tentative_focals:
        K = np.float32([(candidate, 0, pp[0]), (0, candidate, pp[1]), (0, 0, 1)])
        try:
            success, R, T, inliers = cv2.solvePnPRansac(
                pts3d[msk],
                pixels[msk],
                K,
                None,
                iterationsCount=niter_PnP,
                reprojectionError=5,
                flags=cv2.SOLVEPNP_SQPNP,
            )
        except cv2.error:
            # OpenCV rejected this configuration; try the next candidate.
            # (A bare ``except`` here would also swallow KeyboardInterrupt.)
            continue
        if not success:
            continue

        score = len(inliers)
        if score > best[0]:
            best = score, R, T, candidate

    if not best[0]:
        return None

    _, R, T, best_focal = best
    R = cv2.Rodrigues(R)[0]  # world to cam
    R, T = map(torch.from_numpy, (R, T))
    return best_focal, inv(sRT_to_4x4(1, R, T, device))  # cam to world
347
+
348
+
349
def get_known_poses(self):
    """Report which image poses are frozen (i.e. do not require grad).

    Returns:
        ``(count, mask, poses)`` where ``mask`` flags poses with
        ``requires_grad == False`` and ``poses`` is ``self.get_im_poses()``;
        ``(0, None, None)`` when the optimizer has no image poses.
    """
    if not self.has_im_poses:
        return 0, None, None
    frozen = [not (pose.requires_grad) for pose in self.im_poses]
    known_poses_msk = torch.tensor(frozen)
    known_poses = self.get_im_poses()
    return known_poses_msk.sum(), known_poses_msk, known_poses
356
+
357
+
358
def get_known_focals(self):
    """Report which focals are known, as given by ``self.get_known_focal_mask()``.

    Returns:
        ``(count, mask, focals)`` with ``focals = self.get_focals()``;
        ``(0, None, None)`` when the optimizer has no image poses.
    """
    if not self.has_im_poses:
        return 0, None, None
    known_focal_msk = self.get_known_focal_mask()
    known_focals = self.get_focals()
    return known_focal_msk.sum(), known_focal_msk, known_focals
365
+
366
+
367
def align_multiple_poses(src_poses, target_poses):
    """Find the similarity transform aligning one set of poses onto another.

    Each (4, 4) pose contributes two 3D points: its translation column and that
    translation pushed a small step along the pose's third rotation column.
    The two resulting point sets are registered with roma (scaling enabled).

    Returns:
        (s, R, T); ``s`` is replaced by 1.0 when its magnitude is below 1e-6.
    """
    n_poses = len(src_poses)
    assert src_poses.shape == target_poses.shape == (n_poses, 4, 4)

    def center_and_z(poses):
        # Floor the step at 1e-6 so coincident poses do not give a zero offset
        eps = max(get_med_dist_between_poses(poses) / 100, 1e-6)
        centers = poses[:, :3, 3]
        z_axes = poses[:, :3, 2]
        return torch.cat((centers, centers + eps * z_axes))

    src_pts = center_and_z(src_poses)
    dst_pts = center_and_z(target_poses)
    R, T, s = roma.rigid_points_registration(src_pts, dst_pts, compute_scaling=True)
    # A near-zero scale would collapse the alignment; fall back to unit scale
    if abs(s) < 1e-6:
        s = 1.0
    return s, R, T
extern/CUT3R/cloud_opt/dust3r_opt/optimizer.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Main class for the implementation of the global alignment
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from cloud_opt.dust3r_opt.base_opt import BasePCOptimizer
12
+ from dust3r.utils.geometry import xy_grid, geotrf
13
+ from dust3r.utils.device import to_cpu, to_numpy
14
+
15
+
16
class PointCloudOptimizer(BasePCOptimizer):
    """Optimize a global scene, given a list of pairwise observations.

    Graph node: images
    Graph edges: observations = (pred1, pred2)

    Per image, the optimised quantities are a log-depth map, a camera pose,
    a (log-scaled) focal and a principal-point offset.
    """

    def __init__(self, *args, optimize_pp=False, focal_break=20, **kwargs):
        """Create the per-image parameters and the pre-stacked pairwise buffers.

        Args:
            optimize_pp: whether the principal-point offsets require grad.
            focal_break: factor applied to log(focal) in the stored parameter.
            *args, **kwargs: forwarded to ``BasePCOptimizer``.
        """
        super().__init__(*args, **kwargs)

        self.has_im_poses = True  # by definition of this class
        self.focal_break = focal_break

        # adding thing to optimize
        self.im_depthmaps = nn.ParameterList(
            torch.randn(H, W) / 10 - 3 for H, W in self.imshapes
        )  # log(depth)
        self.im_poses = nn.ParameterList(
            self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)
        )  # camera poses
        self.im_focals = nn.ParameterList(
            torch.FloatTensor([self.focal_break * np.log(max(H, W))])
            for H, W in self.imshapes
        )  # camera intrinsics
        self.im_pp = nn.ParameterList(
            torch.zeros((2,)) for _ in range(self.n_imgs)
        )  # camera intrinsics
        self.im_pp.requires_grad_(optimize_pp)

        self.imshape = self.imshapes[0]
        im_areas = [h * w for h, w in self.imshapes]
        self.max_area = max(im_areas)

        # NOTE: im_depthmaps is kept as a ParameterList (one (H, W) parameter
        # per image) rather than stacked, so preset_depth can freeze
        # individual maps through requires_grad_(False).
        self.im_poses = ParameterStack(self.im_poses, is_param=True)
        self.im_focals = ParameterStack(self.im_focals, is_param=True)
        self.im_pp = ParameterStack(self.im_pp, is_param=True)
        self.register_buffer(
            "_pp", torch.tensor([(w / 2, h / 2) for h, w in self.imshapes])
        )
        self.register_buffer(
            "_grid",
            ParameterStack(
                [xy_grid(W, H, device=self.device) for H, W in self.imshapes],
                fill=self.max_area,
            ),
        )

        # pre-compute pixel weights
        self.register_buffer(
            "_weight_i",
            ParameterStack(
                [self.conf_trf(self.conf_i[i_j]) for i_j in self.str_edges],
                fill=self.max_area,
            ),
        )
        self.register_buffer(
            "_weight_j",
            ParameterStack(
                [self.conf_trf(self.conf_j[i_j]) for i_j in self.str_edges],
                fill=self.max_area,
            ),
        )

        # precompute the stacked pairwise predictions and edge endpoints
        self.register_buffer(
            "_stacked_pred_i",
            ParameterStack(self.pred_i, self.str_edges, fill=self.max_area),
        )
        self.register_buffer(
            "_stacked_pred_j",
            ParameterStack(self.pred_j, self.str_edges, fill=self.max_area),
        )
        self.register_buffer("_ei", torch.tensor([i for i, j in self.edges]))
        self.register_buffer("_ej", torch.tensor([j for i, j in self.edges]))
        self.total_area_i = sum([im_areas[i] for i, j in self.edges])
        self.total_area_j = sum([im_areas[j] for i, j in self.edges])

    def _check_all_imgs_are_selected(self, msk):
        """Assert that ``msk`` selects every image, in order."""
        assert np.all(
            self._get_msk_indices(msk) == np.arange(self.n_imgs)
        ), "incomplete mask!"

    def preset_pose(self, known_poses, pose_msk=None):  # cam-to-world
        """Set known cam-to-world poses and freeze all image poses.

        Args:
            known_poses: iterable of (4, 4) poses, or a single 2-D pose tensor.
            pose_msk: selection of images; must cover every image.
        """
        self._check_all_imgs_are_selected(pose_msk)

        if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2:
            known_poses = [known_poses]
        for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses):
            if self.verbose:
                print(f" (setting pose #{idx} = {pose[:3,3]})")
            self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose)))

        self.im_poses.requires_grad_(False)
        # Pairwise scale normalisation is always disabled once poses are
        # preset. The previous "n_known_poses <= 1" computation was immediately
        # overwritten by this assignment, so it has been dropped along with the
        # leftover debug prints.
        self.norm_pw_scale = False

    def preset_focal(self, known_focals, msk=None):
        """Set known focals and freeze all focal parameters."""
        self._check_all_imgs_are_selected(msk)

        for idx, focal in zip(self._get_msk_indices(msk), known_focals):
            if self.verbose:
                print(f" (setting focal #{idx} = {focal})")
            self._no_grad(self._set_focal(idx, focal))

        self.im_focals.requires_grad_(False)

    def preset_principal_point(self, known_pp, msk=None):
        """Set known principal points and freeze all principal-point offsets."""
        self._check_all_imgs_are_selected(msk)

        for idx, pp in zip(self._get_msk_indices(msk), known_pp):
            if self.verbose:
                print(f" (setting principal point #{idx} = {pp})")
            self._no_grad(self._set_principal_point(idx, pp))

        self.im_pp.requires_grad_(False)

    def _get_msk_indices(self, msk):
        """Normalise a mask / index specification into an iterable of indices.

        Accepts None (all images), a single int, a tuple/list, a boolean mask of
        length ``n_imgs``, or an integer array (returned as is).

        Raises:
            ValueError: for any other kind of ``msk``.
        """
        if msk is None:
            return range(self.n_imgs)
        elif isinstance(msk, int):
            return [msk]
        elif isinstance(msk, (tuple, list)):
            return self._get_msk_indices(np.array(msk))
        elif msk.dtype in (bool, torch.bool, np.bool_):
            assert len(msk) == self.n_imgs
            return np.where(msk)[0]
        elif np.issubdtype(msk.dtype, np.integer):
            return msk
        else:
            raise ValueError(f"bad {msk=}")

    def _no_grad(self, tensor):
        """Sanity check used after a ``_set_*`` call: the target must still require grad."""
        assert (
            tensor.requires_grad
        ), "it must be True at this point, otherwise no modification occurs"

    def _set_focal(self, idx, focal, force=False):
        """Store ``focal_break * log(focal)`` for image ``idx`` and return the parameter."""
        param = self.im_focals[idx]
        if (
            param.requires_grad or force
        ):  # can only init a parameter not already initialized
            param.data[:] = self.focal_break * np.log(focal)
        return param

    def get_focals(self):
        """Return the focals, decoded from their log-scaled parameters."""
        log_focals = torch.stack(list(self.im_focals), dim=0)
        return (log_focals / self.focal_break).exp()

    def get_known_focal_mask(self):
        """Return a boolean tensor flagging focals that do not require grad."""
        return torch.tensor([not (p.requires_grad) for p in self.im_focals])

    def _set_principal_point(self, idx, pp, force=False):
        """Store the principal point of image ``idx`` as an offset from the centre, divided by 10."""
        param = self.im_pp[idx]
        H, W = self.imshapes[idx]
        if (
            param.requires_grad or force
        ):  # can only init a parameter not already initialized
            param.data[:] = to_cpu(to_numpy(pp) - (W / 2, H / 2)) / 10
        return param

    def get_principal_points(self):
        """Return the principal points: image centre plus 10x the learned offset."""
        return self._pp + 10 * self.im_pp

    def get_intrinsics(self):
        """Return the (n_imgs, 3, 3) intrinsics built from focals and principal points."""
        K = torch.zeros((self.n_imgs, 3, 3), device=self.device)
        focals = self.get_focals().flatten()
        K[:, 0, 0] = K[:, 1, 1] = focals
        K[:, :2, 2] = self.get_principal_points()
        K[:, 2, 2] = 1
        return K

    def get_im_poses(self):  # cam to world
        """Return the cam-to-world image poses."""
        cam2world = self._get_poses(self.im_poses)
        return cam2world

    def preset_depth(self, known_depths, msk=None):
        """Preset known depth maps and freeze them.

        Args:
            known_depths: list of depth maps, or a single 2-D depth map
                (torch tensor or numpy array), in normal depth space (not log).
            msk: mask or indices of the images to preset; must cover every image.
        """
        self._check_all_imgs_are_selected(msk)

        if isinstance(known_depths, (torch.Tensor, np.ndarray)) and known_depths.ndim == 2:
            known_depths = [known_depths]

        for idx, depth in zip(self._get_msk_indices(msk), known_depths):
            if self.verbose:
                print(f" (setting depth #{idx})")
            # No log here: _set_depthmap expects normal-space depth. It also
            # does the reshape, so numpy inputs only need converting to tensors.
            self._no_grad(self._set_depthmap(idx, torch.as_tensor(depth)))
            self.im_depthmaps[idx].requires_grad_(False)

    def _set_depthmap(self, idx, depth, force=False):
        """Set the depth map of one image.

        Args:
            idx: image index.
            depth: depth map in normal space (not log space), with H*W elements.
            force: write even if the parameter no longer requires grad.

        Returns:
            The (H, W) log-depth parameter of image ``idx``.
        """
        # Reshape straight to this image's own (H, W). Padding to max_area
        # first (as before) made the reshape fail for every image smaller than
        # the largest one.
        depth = torch.as_tensor(depth).reshape(self.imshapes[idx])
        depth = depth.nan_to_num(neginf=0)
        param = self.im_depthmaps[idx]
        if (
            param.requires_grad or force
        ):  # can only init a parameter not already initialized
            param.data[:] = depth.log().nan_to_num(neginf=0)  # Store in log space
        return param

    def get_depthmaps(self, raw=False):
        """Return the depth maps (exp of the stored log-depths).

        Args:
            raw: if True, return one (n_imgs, max_area) tensor where each map is
                flattened and zero-padded in log space; otherwise a list of
                (h, w) views.
        """
        # Stack the parameters directly so autograd reaches im_depthmaps.
        # Going through ParameterStack detached them into a fresh Parameter,
        # which left the depth maps without any gradient.
        log_depth = torch.stack(
            [_ravel_hw(dm, self.max_area) for dm in self.im_depthmaps]
        )
        res = log_depth.exp()
        if not raw:
            res = [dm[: h * w].view(h, w) for dm, (h, w) in zip(res, self.imshapes)]
        return res

    def depth_to_pts3d(self):
        """Back-project every depth map to world-frame 3D points (padded layout)."""
        focals = self.get_focals()
        pp = self.get_principal_points()
        im_poses = self.get_im_poses()
        depth = self.get_depthmaps(raw=True)

        # get pointmaps in camera frame
        rel_ptmaps = _fast_depthmap_to_pts3d(depth, self._grid, focals, pp=pp)
        # project to world frame
        return geotrf(im_poses, rel_ptmaps)

    def get_pts3d(self, raw=False):
        """Return world-frame pointmaps; a list of (h, w, 3) views unless ``raw``."""
        res = self.depth_to_pts3d()
        if not raw:
            res = [dm[: h * w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
        return res

    def forward(self):
        """Compute the alignment loss between global pointmaps and pairwise predictions."""
        pw_poses = self.get_pw_poses()  # cam-to-world
        pw_adapt = self.get_adaptors().unsqueeze(1)
        proj_pts3d = self.get_pts3d(raw=True)

        # rotate pairwise prediction according to pw_poses
        aligned_pred_i = geotrf(pw_poses, pw_adapt * self._stacked_pred_i)
        aligned_pred_j = geotrf(pw_poses, pw_adapt * self._stacked_pred_j)

        # compute the loss, normalised by the total pixel area on each side
        li = (
            self.dist(proj_pts3d[self._ei], aligned_pred_i, weight=self._weight_i).sum()
            / self.total_area_i
        )
        lj = (
            self.dist(proj_pts3d[self._ej], aligned_pred_j, weight=self._weight_j).sum()
            / self.total_area_j
        )

        return li + lj
289
+
290
+
291
+ def _fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp):
292
+ pp = pp.unsqueeze(1)
293
+ focal = focal.unsqueeze(1)
294
+ if depth.ndim == 3:
295
+ depth = depth.view(depth.shape[0], -1)
296
+ assert focal.shape == (len(depth), 1, 1)
297
+ assert pp.shape == (len(depth), 1, 2)
298
+ assert pixel_grid.shape == depth.shape + (2,)
299
+ depth = depth.unsqueeze(-1)
300
+ return torch.cat((depth * (pixel_grid - pp) / focal, depth), dim=-1)
301
+
302
+
303
def ParameterStack(params, keys=None, is_param=None, fill=0):
    """Stack a collection of tensors into one float tensor or ``nn.Parameter``.

    Args:
        params: sequence of tensors, or a mapping when ``keys`` is given.
        keys: optional keys selecting (and ordering) entries of ``params``.
        is_param: if truthy, always return an ``nn.Parameter`` and assert that
            all inputs share the same ``requires_grad``.
        fill: if > 0, each entry is flattened over its first two dims and
            zero-padded to this length before stacking.

    Returns:
        The stacked, detached float tensor; wrapped in an ``nn.Parameter``
        (inheriting ``requires_grad`` from the first entry) when ``is_param`` is
        truthy or the first entry requires grad.
    """
    if keys is not None:
        params = [params[key] for key in keys]

    if fill > 0:
        params = [_ravel_hw(entry, fill) for entry in params]

    requires_grad = params[0].requires_grad
    if is_param:
        assert all(p.requires_grad == requires_grad for p in params)

    stacked = torch.stack(list(params)).float().detach()
    if not (is_param or requires_grad):
        return stacked
    stacked = nn.Parameter(stacked)
    stacked.requires_grad_(requires_grad)
    return stacked
318
+
319
+
320
+ def _ravel_hw(tensor, fill=0):
321
+ # ravel H,W
322
+ tensor = tensor.view((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
323
+
324
+ if len(tensor) < fill:
325
+ tensor = torch.cat(
326
+ (tensor, tensor.new_zeros((fill - len(tensor),) + tensor.shape[1:]))
327
+ )
328
+ return tensor
329
+
330
+
331
def acceptable_focal_range(H, W, minf=0.5, maxf=3.5):
    """Return ``(min_focal, max_focal)`` as multiples of a reference focal.

    The reference is the focal giving a 60 degree field of view over the larger
    image side, i.e. ``max(H, W) / 1.1547005383792515``.
    """
    half_fov = np.deg2rad(60) / 2
    focal_base = max(H, W) / (2 * np.tan(half_fov))
    return minf * focal_base, maxf * focal_base
336
+
337
+
338
def apply_mask(img, msk):
    """Return a copy of ``img`` with the entries selected by ``msk`` set to 0.

    The input image is left untouched.
    """
    masked = img.copy()
    masked[msk] = 0
    return masked
extern/CUT3R/cloud_opt/init_all.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import cache
2
+ import numpy as np
3
+ import scipy.sparse as sp
4
+ import torch
5
+ import cv2
6
+ import roma
7
+ from tqdm import tqdm
8
+
9
+ from cloud_opt.utils import *
10
+
11
+
12
def compute_edge_scores(edges, edge2conf_i, edge2conf_j):
    """Score every edge from its two confidence maps.

    Args:
        edges: iterable of ``('i_j', (i, j))`` pairs.
        edge2conf_i, edge2conf_j: mappings from the 'i_j' key to confidence maps.

    Returns:
        dict mapping ``(i, j)`` to ``edge_conf`` of the two maps.
    """
    score_dict = {}
    for key, (i, j) in edges:
        score_dict[(i, j)] = edge_conf(edge2conf_i[key], edge2conf_j[key])
    return score_dict
20
+
21
+
22
def dict_to_sparse_graph(dic):
    """Turn a ``{(i, j): value}`` mapping into a square scipy DOK sparse array.

    The side length is one more than the largest index found in any key.
    """
    size = 1 + max(max(edge) for edge in dic)
    graph = sp.dok_array((size, size))
    for (row, col), weight in dic.items():
        graph[row, col] = weight
    return graph
28
+
29
+
30
@torch.no_grad()
def init_minimum_spanning_tree(self, **kw):
    """Initialise all camera poses (image-wise and pairwise) from pairwise estimates.

    Runs ``minimum_spanning_tree`` on the optimizer's edges and predictions,
    then hands the resulting points, focals and poses to ``init_from_pts3d``.
    Extra keyword arguments are forwarded to ``minimum_spanning_tree``.
    """
    mst_result = minimum_spanning_tree(
        self.imshapes,
        self.edges,
        self.edge2pts_i,
        self.edge2pts_j,
        self.edge2conf_i,
        self.edge2conf_j,
        self.im_conf,
        self.min_conf_thr,
        self.device,
        has_im_poses=self.has_im_poses,
        verbose=self.verbose,
        **kw,
    )
    # the list of spanning-tree edges is not needed here
    pts3d, _, im_focals, im_poses = mst_result
    return init_from_pts3d(self, pts3d, im_focals, im_poses)
52
+
53
+
54
def minimum_spanning_tree(
    imshapes,
    edges,
    edge2pred_i,
    edge2pred_j,
    edge2conf_i,
    edge2conf_j,
    im_conf,
    min_conf_thr,
    device,
    has_im_poses=True,
    niter_PnP=10,
    verbose=True,
    save_score_path=None,
):
    """Initialise a global point cloud by walking a maximum-confidence spanning tree.

    Edge scores are negated so that scipy's *minimum* spanning tree selects the
    most confident pairs. Starting from the strongest edge, every other tree
    edge is registered onto the already-placed image it touches.

    Args:
        imshapes: per-image shapes; only its length (number of images) is used.
        edges: iterable of (i, j) image index pairs.
        edge2pred_i, edge2pred_j: mappings "i_j" -> pointmap of image i / j for that pair.
        edge2conf_i, edge2conf_j: mappings "i_j" -> confidence map of each prediction.
        im_conf: per-image confidence maps, thresholded to select PnP points.
        min_conf_thr: confidence threshold applied to ``im_conf``.
        device: device on which poses are created.
        has_im_poses: when False, no poses/focals are estimated (both returned as None).
        niter_PnP: RANSAC iteration count forwarded to ``fast_pnp``.
        verbose: print the order in which edges are consumed.
        save_score_path: accepted for interface compatibility; not used here.

    Returns:
        (pts3d, msp_edges, im_focals, im_poses) where ``im_poses`` is a stacked
        tensor and ``im_focals`` a list, or both None when ``has_im_poses`` is False.

    Raises:
        ValueError: if the pairwise graph is not connected, so some tree edges
            can never be attached to the initial pair.
    """
    n_imgs = len(imshapes)
    edge_scores = compute_edge_scores(map(i_j_ij, edges), edge2conf_i, edge2conf_j)
    sparse_graph = -dict_to_sparse_graph(edge_scores)
    msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()

    # temp variable to store 3d points
    pts3d = [None] * n_imgs

    todo = sorted(zip(-msp.data, msp.row, msp.col))  # sorted edges
    im_poses = [None] * n_imgs
    im_focals = [None] * n_imgs

    # init with strongest edge
    score, i, j = todo.pop()
    if verbose:
        print(f" init edge ({i}*,{j}*) {score=}")
    i_j = edge_str(i, j)

    pts3d[i] = edge2pred_i[i_j].clone()
    pts3d[j] = edge2pred_j[i_j].clone()
    done = {i, j}
    if has_im_poses:
        im_poses[i] = torch.eye(4, device=device)
        im_focals[i] = estimate_focal(edge2pred_i[i_j])

    # set initial pointcloud based on pairwise graph
    msp_edges = [(i, j)]
    n_deferred = 0  # consecutive edges postponed without attaching anything
    while todo:
        # each time, predict the next one
        score, i, j = todo.pop()
        # The key must be refreshed *before* it is used: previously the focal
        # below was estimated from the previous iteration's edge.
        i_j = edge_str(i, j)

        if im_focals[i] is None:
            im_focals[i] = estimate_focal(edge2pred_i[i_j])

        if i in done:
            if verbose:
                print(f" init edge ({i},{j}*) {score=}")
            assert j not in done
            # align pred[i] with pts3d[i], and then set j accordingly
            s, R, T = rigid_points_registration(
                edge2pred_i[i_j], pts3d[i], conf=edge2conf_i[i_j]
            )
            trf = sRT_to_4x4(s, R, T, device)
            pts3d[j] = geotrf(trf, edge2pred_j[i_j])
            done.add(j)
            msp_edges.append((i, j))

            if has_im_poses and im_poses[i] is None:
                im_poses[i] = sRT_to_4x4(1, R, T, device)

        elif j in done:
            if verbose:
                print(f" init edge ({i}*,{j}) {score=}")
            assert i not in done
            s, R, T = rigid_points_registration(
                edge2pred_j[i_j], pts3d[j], conf=edge2conf_j[i_j]
            )
            trf = sRT_to_4x4(s, R, T, device)
            pts3d[i] = geotrf(trf, edge2pred_i[i_j])
            done.add(i)
            msp_edges.append((i, j))

            if has_im_poses and im_poses[i] is None:
                im_poses[i] = sRT_to_4x4(1, R, T, device)
        else:
            # let's try again later
            todo.insert(0, (score, i, j))
            n_deferred += 1
            if n_deferred >= len(todo):
                # every remaining edge was postponed once and nothing was
                # attached in between: they will never become reachable
                raise ValueError(
                    "pairwise graph is not connected: cannot attach remaining "
                    f"edges {[(a, b) for _, a, b in todo]}"
                )
            continue
        n_deferred = 0

    if has_im_poses:
        # complete all missing informations
        pair_scores = list(
            sparse_graph.values()
        )  # already negative scores: less is best
        edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[
            np.argsort(pair_scores)
        ]
        for i, j in edges_from_best_to_worse.tolist():
            if im_focals[i] is None:
                im_focals[i] = estimate_focal(edge2pred_i[edge_str(i, j)])

        for i in range(n_imgs):
            if im_poses[i] is None:
                msk = im_conf[i] > min_conf_thr
                res = fast_pnp(
                    pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP
                )
                if res:
                    im_focals[i], im_poses[i] = res
            if im_poses[i] is None:
                # PnP failed (or was skipped): fall back to the identity pose
                im_poses[i] = torch.eye(4, device=device)
        im_poses = torch.stack(im_poses)
    else:
        im_poses = im_focals = None

    return pts3d, msp_edges, im_focals, im_poses
166
+
167
+
168
def init_from_pts3d(self, pts3d, im_focals, im_poses):
    """Initialise the optimizer state from a global point cloud.

    Args:
        self: the optimizer being initialised.
        pts3d: per-image world-frame pointmaps (modified in place).
        im_focals: per-image focals (entries may be None), or None.
        im_poses: stacked cam-to-world poses, or None when the optimizer has
            no image poses.

    Raises:
        NotImplementedError: when exactly one pose is known in advance.
    """
    # init poses
    nkp, known_poses_msk, known_poses = self.get_known_poses()
    if nkp == 1:
        raise NotImplementedError(
            "Would be simpler to just align everything afterwards on the single known pose"
        )
    elif nkp > 1:
        # global rigid SE3 alignment
        s, R, T = align_multiple_poses(
            im_poses[known_poses_msk], known_poses[known_poses_msk]
        )
        trf = sRT_to_4x4(s, R, T, device=known_poses.device)

        # rotate everything
        im_poses = trf @ im_poses
        im_poses[:, :3, :3] /= s  # undo scaling on the rotation part
        for img_pts3d in pts3d:
            img_pts3d[:] = geotrf(trf, img_pts3d)
    # else: no known poses, nothing to align

    # set all pairwise poses
    for e, (i, j) in enumerate(self.edges):
        i_j = edge_str(i, j)
        # compute transform that goes from cam to world
        s, R, T = rigid_points_registration(
            self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j]
        )
        self._set_pose(self.pw_poses, e, R, T, scale=s)

    # take into account the scale normalization
    s_factor = self.get_pw_norm_scale_factor()
    if im_poses is not None:
        # im_poses is None when poses were not estimated (has_im_poses False)
        im_poses[:, :3, 3] *= s_factor  # apply downscaling factor
    for img_pts3d in pts3d:
        img_pts3d *= s_factor

    # init all image poses
    if self.has_im_poses:
        for i in range(self.n_imgs):
            cam2world = im_poses[i]
            depth = geotrf(inv(cam2world), pts3d[i])[..., 2]
            self._set_depthmap(i, depth)
            self._set_pose(self.im_poses, i, cam2world)
            if im_focals[i] is not None:
                if not self.shared_focal:
                    self._set_focal(i, im_focals[i])
        if self.shared_focal:
            # Average only the focals that were estimated. The per-image path
            # above tolerates None entries, and summing them would raise.
            known_focals = [f for f in im_focals if f is not None]
            if known_focals:
                self._set_focal(0, sum(known_focals) / len(known_focals))
        if self.n_imgs > 2:
            self._set_init_depthmap()

    if self.verbose:
        with torch.no_grad():
            print(" init loss =", float(self()))
extern/CUT3R/cloud_opt/utils.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import roma
4
+ import numpy as np
5
+ import cv2
6
+ from functools import cache
7
+
8
+
9
def todevice(batch, device, callback=None, non_blocking=False):
    """Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy).

    Args:
        batch: list, tuple, dict of tensors or other things.
        device: pytorch device or 'numpy'.
        callback: function called on every element, including nested
            sub-elements of containers, before it is converted.
        non_blocking: forwarded to ``Tensor.to`` for every tensor moved.

    Returns:
        The same structure with tensors/arrays converted. For a torch device,
        numpy arrays become tensors; for 'numpy', tensors become arrays.
        Anything else is returned unchanged.
    """
    if callback:
        batch = callback(batch)

    # Recurse with the same options: they used to be dropped for nested
    # elements, so the callback only ever saw the top-level container.
    if isinstance(batch, dict):
        return {
            k: todevice(v, device, callback=callback, non_blocking=non_blocking)
            for k, v in batch.items()
        }

    if isinstance(batch, (tuple, list)):
        return type(batch)(
            todevice(x, device, callback=callback, non_blocking=non_blocking)
            for x in batch
        )

    x = batch
    if device == "numpy":
        if isinstance(x, torch.Tensor):
            x = x.detach().cpu().numpy()
    elif x is not None:
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x)
        if torch.is_tensor(x):
            x = x.to(device, non_blocking=non_blocking)
    return x


to_device = todevice  # alias
38
+
39
+
40
def to_numpy(x):
    """Convert every tensor found in ``x`` (possibly nested) to a numpy array."""
    return todevice(x, device="numpy")
42
+
43
+
44
def to_cpu(x):
    """Move every tensor/array found in ``x`` (possibly nested) to the CPU as torch tensors."""
    return todevice(x, device="cpu")
46
+
47
+
48
def to_cuda(x):
    """Move every tensor/array found in ``x`` (possibly nested) to the CUDA device."""
    return todevice(x, device="cuda")
50
+
51
+
52
def signed_log1p(x):
    """Return ``sign(x) * log(1 + |x|)`` element-wise."""
    magnitude = torch.log1p(torch.abs(x))
    return torch.sign(x) * magnitude
55
+
56
+
57
def l2_dist(a, b, weight):
    """Weighted squared Euclidean distance along the last dimension."""
    diff = a - b
    return diff.square().sum(dim=-1) * weight
59
+
60
+
61
def l1_dist(a, b, weight):
    """Weighted (non-squared) Euclidean norm of ``a - b`` along the last dimension."""
    diff = a - b
    return diff.norm(dim=-1) * weight
63
+
64
+
65
# Registry of the available distance functions, keyed by short name.
ALL_DISTS = {"l1": l1_dist, "l2": l2_dist}
66
+
67
+
68
+ def _check_edges(edges):
69
+ indices = sorted({i for edge in edges for i in edge})
70
+ assert indices == list(range(len(indices))), "bad pair indices: missing values "
71
+ return len(indices)
72
+
73
+
74
def NoGradParamDict(x):
    """Wrap a dict into an ``nn.ParameterDict`` whose entries do not require grad."""
    assert isinstance(x, dict)
    params = nn.ParameterDict(x)
    return params.requires_grad_(False)
77
+
78
+
79
def edge_str(i, j):
    """Return the string key ``"i_j"`` identifying the edge (i, j)."""
    return "{}_{}".format(i, j)
81
+
82
+
83
def i_j_ij(ij):
    """Pair an edge ``(i, j)`` with its string key: returns ``("i_j", (i, j))``."""
    key = edge_str(*ij)
    return key, ij
86
+
87
+
88
def edge_conf(conf_i, conf_j):
    """Score an edge as the product of its two mean confidences (Python float)."""
    mean_i = conf_i.mean()
    mean_j = conf_j.mean()
    return float(mean_i * mean_j)
91
+
92
+
93
def get_imshapes(edges, pred_i, pred_j):
    """Collect the (H, W) shape of every image from the pairwise predictions.

    Args:
        edges: sequence of (i, j) image index pairs.
        pred_i, pred_j: per-edge dicts holding the pointmaps of image i / j.

    Returns:
        list of (H, W) tuples indexed by image; asserts that an image seen in
        several edges always has the same shape.
    """
    # NOTE(review): the key "pts3d_is_self_view" may be a typo for
    # "pts3d_in_self_view"; kept as is, to be confirmed against the producer.
    n_imgs = 1 + max(max(edge) for edge in edges)
    imshapes = [None] * n_imgs
    for e, (i, j) in enumerate(edges):
        shape_i = tuple(pred_i[e]["pts3d_is_self_view"].shape[0:2])
        shape_j = tuple(pred_j[e]["pts3d_in_other_view"].shape[0:2])
        prev_i, prev_j = imshapes[i], imshapes[j]
        assert not prev_i or prev_i == shape_i, f"incorrect shape for image {i}"
        assert not prev_j or prev_j == shape_j, f"incorrect shape for image {j}"
        imshapes[i] = shape_i
        imshapes[j] = shape_j
    return imshapes
106
+
107
+
108
def get_conf_trf(mode):
    """Return the confidence transform selected by ``mode``.

    Supported modes: "log" (``x.log()``), "sqrt" (``x.sqrt()``), "m1"
    (``x - 1``) and "id" / "none" (identity).

    Raises:
        ValueError: for any other mode.
    """
    if mode == "log":

        def conf_trf(x):
            return x.log()

        return conf_trf

    if mode == "sqrt":

        def conf_trf(x):
            return x.sqrt()

        return conf_trf

    if mode == "m1":

        def conf_trf(x):
            return x - 1

        return conf_trf

    if mode in ("id", "none"):

        def conf_trf(x):
            return x

        return conf_trf

    raise ValueError(f"bad mode for {mode=}")
132
+
133
+
134
@torch.no_grad()
def _compute_img_conf(imshapes, device, edges, edge2conf_i, edge2conf_j):
    """Compute a per-image confidence map as the element-wise max over its edges.

    Each image starts from zeros of its own shape; every edge (i, j) then
    raises image i with ``edge2conf_i["i_j"]`` and image j with
    ``edge2conf_j["i_j"]``.

    Returns:
        ``nn.ParameterList`` with one confidence map per image.
    """
    zeros = [torch.zeros(hw, device=device) for hw in imshapes]
    im_conf = nn.ParameterList(zeros)
    for i, j in edges:
        key = edge_str(i, j)
        im_conf[i] = torch.maximum(im_conf[i], edge2conf_i[key])
        im_conf[j] = torch.maximum(im_conf[j], edge2conf_j[key])
    return im_conf
141
+
142
+
143
def xy_grid(
    W,
    H,
    device=None,
    origin=(0, 0),
    unsqueeze=None,
    cat_dim=-1,
    homogeneous=False,
    **arange_kw,
):
    """Output a (H,W,2) integer grid of pixel coordinates
        with output[j,i,0] = i + origin[0]
             output[j,i,1] = j + origin[1]

    Args:
        W, H: grid width and height.
        device: None for numpy arrays, otherwise the torch device to use.
        origin: (x, y) offset added to the coordinates.
        unsqueeze: if not None, axis inserted into every component.
        cat_dim: axis along which components are stacked; None returns a tuple
            of components instead.
        homogeneous: append a third component filled with ones.
        **arange_kw: forwarded to ``arange`` (e.g. ``dtype``); the integer
            width follows the ``arange`` default unless overridden.
    """
    if device is None:
        # numpy
        arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones
        expand_dims = np.expand_dims
    else:
        # torch
        def arange(*a, **kw):
            return torch.arange(*a, device=device, **kw)

        def ones(*a):
            return torch.ones(*a, device=device)

        meshgrid, stack = torch.meshgrid, torch.stack
        expand_dims = torch.unsqueeze

    tw, th = [arange(o, o + s, **arange_kw) for s, o in zip((W, H), origin)]
    # np.meshgrid may return a list: normalise so the concatenation below works
    grid = tuple(meshgrid(tw, th, indexing="xy"))
    if homogeneous:
        grid = grid + (ones((H, W)),)
    if unsqueeze is not None:
        # apply to every component (keeps the homogeneous one, works for numpy)
        grid = tuple(expand_dims(g, unsqueeze) for g in grid)
    if cat_dim is not None:
        grid = stack(grid, cat_dim)
    return grid
175
+
176
+
177
def estimate_focal_knowing_depth(
    pts3d, pp, focal_mode="median", min_focal=0.0, max_focal=np.inf
):
    """Estimate a focal length per batch item from pointmaps with known depth.

    Args:
        pts3d: (B, H, W, 3) camera-frame points.
        pp: principal points, viewed as (B, 1, 2).
        focal_mode: "median" (nan-median of the per-pixel votes ``u*z/x`` and
            ``v*z/y``) or "weiszfeld" (closed-form least-squares start followed
            by 10 re-weighted least-squares iterations).
        min_focal, max_focal: clipping bounds, as multiples of the focal that
            gives a 60 degree field of view over ``max(H, W)``.

    Returns:
        Tensor of B clipped focals.

    Raises:
        ValueError: for an unknown ``focal_mode``.
    """
    B, H, W, THREE = pts3d.shape
    assert THREE == 3

    # centered pixel grid
    grid = xy_grid(W, H, device=pts3d.device).view(1, -1, 2)
    pixels = grid - pp.view(-1, 1, 2)  # B,HW,2
    pts3d = pts3d.flatten(1, 2)  # (B, HW, 3)

    if focal_mode == "median":
        with torch.no_grad():
            # direct estimation of focal
            u, v = pixels.unbind(dim=-1)
            x, y, z = pts3d.unbind(dim=-1)
            votes_x = (u * z) / x
            votes_y = (v * z) / y

            # assume square pixels, hence same focal for X and Y
            all_votes = torch.cat((votes_x.view(B, -1), votes_y.view(B, -1)), dim=-1)
            focal = torch.nanmedian(all_votes, dim=-1).values

    elif focal_mode == "weiszfeld":
        # init focal with l2 closed form
        # we try to find focal = argmin Sum | pixel - focal * (x,y)/z|
        ratio = pts3d[..., :2] / pts3d[..., 2:3]
        xy_over_z = ratio.nan_to_num(posinf=0, neginf=0)  # homogeneous (x,y,1)

        dot_xy_px = (xy_over_z * pixels).sum(dim=-1)
        dot_xy_xy = xy_over_z.square().sum(dim=-1)

        focal = dot_xy_px.mean(dim=1) / dot_xy_xy.mean(dim=1)

        # iterative re-weighted least-squares
        for _ in range(10):
            # re-weighting by inverse of distance
            residual = pixels - focal.view(-1, 1, 1) * xy_over_z
            w = residual.norm(dim=-1).clip(min=1e-8).reciprocal()
            # update the scaling with the new weights
            focal = (w * dot_xy_px).mean(dim=1) / (w * dot_xy_xy).mean(dim=1)
    else:
        raise ValueError(f"bad {focal_mode=}")

    # reference focal: size / 1.1547005383792515
    focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2))
    focal = focal.clip(min=min_focal * focal_base, max=max_focal * focal_base)
    return focal
234
+
235
+
236
def estimate_focal(pts3d_i, pp=None):
    """Estimate a scalar focal for a single (H, W, 3) point map.

    When ``pp`` is None the principal point defaults to the image centre
    ``(W / 2, H / 2)``. The estimate uses the ``"weiszfeld"`` mode of
    ``estimate_focal_knowing_depth`` and is returned as a Python float.
    """
    if pp is None:
        height, width, channels = pts3d_i.shape
        assert channels == 3
        pp = torch.tensor((width / 2, height / 2), device=pts3d_i.device)

    # add a batch dimension of one for the batched estimator
    batched = estimate_focal_knowing_depth(
        pts3d_i.unsqueeze(0), pp.unsqueeze(0), focal_mode="weiszfeld"
    )
    return float(batched.ravel())
245
+
246
+
247
def rigid_points_registration(pts1, pts2, conf):
    """Register two point sets with ``roma``, weighting points by ``conf``.

    Both point sets are flattened to (N, 3) and ``conf`` to (N,). Scaling is
    estimated as well. The result is reordered to ``(scale, R, T)``.
    """
    flat_src = pts1.reshape(-1, 3)
    flat_dst = pts2.reshape(-1, 3)
    rotation, translation, scale = roma.rigid_points_registration(
        flat_src,
        flat_dst,
        weights=conf.ravel(),
        compute_scaling=True,
    )
    # return un-scaled (R, T)
    return scale, rotation, translation
255
+
256
+
257
def sRT_to_4x4(scale, R, T, device):
    """Assemble a 4x4 transform whose upper-left 3x3 is ``R * scale`` and whose
    last column is ``T`` (flattened, not scaled). Built from ``torch.eye(4)``
    on ``device``.
    """
    mat = torch.eye(4, device=device)
    mat[:3, 3] = T.ravel()  # translation is stored as-is
    mat[:3, :3] = R * scale
    return mat
262
+
263
+
264
def geotrf(Trf, pts, ncol=None, norm=False):
    """Apply a geometric transformation to a set of points.

    Trf: numpy array or torch tensor with at least 2 dimensions, e.g. a 3x3 or
         4x4 matrix (optionally with leading batch dimensions).
    pts: numpy/torch/tuple of coordinates, converted to the same library as
         Trf. Its last dimension is either equal to Trf's last dimension, or
         one less (in which case the last column of Trf is added as an offset).

    ncol: int. number of columns kept in the result (defaults to pts.shape[-1]).
    norm: float. if != 0, the result is divided by its last coordinate and, if
          norm != 1, multiplied by norm.

    Returns the transformed points, reshaped to pts.shape[:-1] + (ncol,).
    """
    assert Trf.ndim >= 2
    if isinstance(Trf, np.ndarray):
        pts = np.asarray(pts)
    elif isinstance(Trf, torch.Tensor):
        pts = torch.as_tensor(pts, dtype=Trf.dtype)

    # remember the leading dimensions so the result can be reshaped back
    output_reshape = pts.shape[:-1]
    ncol = ncol or pts.shape[-1]

    # optimized code: batched torch matrices applied to (B,H,W,d) point maps
    if (
        isinstance(Trf, torch.Tensor)
        and isinstance(pts, torch.Tensor)
        and Trf.ndim == 3
        and pts.ndim == 4
    ):
        d = pts.shape[3]
        if Trf.shape[-1] == d:
            pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
        elif Trf.shape[-1] == d + 1:
            # linear part on the points, plus the last column as an offset
            pts = (
                torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts)
                + Trf[:, None, None, :d, d]
            )
        else:
            raise ValueError(f"bad shape, not ending with 3 or 4, for {pts.shape=}")
    else:
        if Trf.ndim >= 3:
            # flatten all leading batch dimensions of Trf into one
            n = Trf.ndim - 2
            assert Trf.shape[:n] == pts.shape[:n], "batch size does not match"
            Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])

            if pts.ndim > Trf.ndim:
                # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
                pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
            elif pts.ndim == 2:
                # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
                pts = pts[:, None, :]

        if pts.shape[-1] + 1 == Trf.shape[-1]:
            # points have one coordinate less than Trf: use the last row of
            # the transposed matrix as an additive offset
            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
            pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
        elif pts.shape[-1] == Trf.shape[-1]:
            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
            pts = pts @ Trf
        else:
            pts = Trf @ pts.T
            if pts.ndim >= 2:
                pts = pts.swapaxes(-1, -2)

    if norm:
        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
        if norm != 1:
            pts *= norm

    res = pts[..., :ncol].reshape(*output_reshape, ncol)
    return res
333
+
334
+
335
def inv(mat):
    """Return the inverse of a numpy array or torch tensor.

    Raises:
        ValueError: if ``mat`` is neither an ``np.ndarray`` nor a
            ``torch.Tensor``.
    """
    if isinstance(mat, np.ndarray):
        return np.linalg.inv(mat)
    if isinstance(mat, torch.Tensor):
        return torch.linalg.inv(mat)
    raise ValueError(f"bad matrix type = {type(mat)}")
342
+
343
+
344
@cache
def pixel_grid(H, W):
    """Return a cached float32 (H, W, 2) grid where ``grid[j, i] == (i, j)``.

    The result is memoised per (H, W), so callers share the same array object.
    """
    coords = np.mgrid[0:W, 0:H]  # shape (2, W, H)
    # reversing all three axes is what ``.T`` does on a 3-D array
    return coords.transpose(2, 1, 0).astype(np.float32)
347
+
348
+
349
def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
    """Recover a camera pose (and focal) from a point map with RANSAC-PnP.

    Args:
        pts3d: (H, W, 3) point map.
        focal: known focal, or None to try 21 geometrically spaced candidates
            between ``max(W, H) / 2`` and ``max(W, H) * 3``.
        msk: mask selecting the points used for PnP.
        device: device of the returned 4x4 matrix.
        pp: principal point; defaults to ``(W / 2, H / 2)``.
        niter_PnP: RANSAC iteration count.

    Returns:
        ``(best_focal, cam_to_world)`` for the candidate with the most
        inliers, or None when fewer than 4 points are selected or no
        candidate succeeds.
    """
    # PnP needs at least 4 correspondences
    if msk.sum() < 4:
        return None
    pts3d, msk = to_numpy(pts3d), to_numpy(msk)

    H, W, THREE = pts3d.shape
    assert THREE == 3
    pixels = pixel_grid(H, W)

    if focal is None:
        longest = max(W, H)
        candidates = np.geomspace(longest / 2, longest * 3, 21)
    else:
        candidates = [focal]

    pp = (W / 2, H / 2) if pp is None else to_numpy(pp)

    # the 3D/2D correspondences are the same for every candidate focal
    object_points = pts3d[msk]
    image_points = pixels[msk]

    best_score, best_rvec, best_tvec, best_focal = 0, None, None, None
    for candidate in candidates:
        K = np.float32(
            [(candidate, 0, pp[0]), (0, candidate, pp[1]), (0, 0, 1)]
        )

        success, rvec, tvec, inliers = cv2.solvePnPRansac(
            object_points,
            image_points,
            K,
            None,
            iterationsCount=niter_PnP,
            reprojectionError=5,
            flags=cv2.SOLVEPNP_SQPNP,
        )
        if not success:
            continue

        # keep the candidate supported by the largest inlier set
        num_inliers = len(inliers)
        if num_inliers > best_score:
            best_score = num_inliers
            best_rvec, best_tvec, best_focal = rvec, tvec, candidate

    if not best_score:
        return None

    rot = cv2.Rodrigues(best_rvec)[0]  # world to cam
    rot, trans = torch.from_numpy(rot), torch.from_numpy(best_tvec)
    return best_focal, inv(sRT_to_4x4(1, rot, trans, device))  # cam to world
397
+
398
+
399
def get_med_dist_between_poses(poses):
    """Return the median pairwise distance between the ``[:3, 3]`` columns
    (translations) of the given poses.
    """
    from scipy.spatial.distance import pdist

    centers = [to_numpy(pose[:3, 3]) for pose in poses]
    pairwise = pdist(centers)
    return np.median(pairwise)
403
+
404
+
405
def align_multiple_poses(src_poses, target_poses):
    """Find the similarity (scale, R, T) registering ``src_poses`` onto
    ``target_poses``.

    Both inputs must have shape (N, 4, 4). Each pose contributes two points:
    its translation, and its translation shifted along its third rotation
    column by 1/100 of the median inter-pose distance.

    Returns:
        ``(s, R, T)`` as computed by ``roma.rigid_points_registration``.
    """
    N = len(src_poses)
    assert src_poses.shape == target_poses.shape == (N, 4, 4)

    def _anchor_points(poses):
        step = get_med_dist_between_poses(poses) / 100
        origins = poses[:, :3, 3]
        z_axes = poses[:, :3, 2]
        return torch.cat((origins, origins + step * z_axes))

    src_points = _anchor_points(src_poses)
    target_points = _anchor_points(target_poses)
    R, T, s = roma.rigid_points_registration(
        src_points, target_points, compute_scaling=True
    )
    return s, R, T
417
+
418
+
419
def cosine_schedule(t, lr_start, lr_end):
    """Cosine interpolation from ``lr_start`` (t=0) to ``lr_end`` (t=1).

    ``t`` must lie in [0, 1].
    """
    assert 0 <= t <= 1
    span = lr_start - lr_end
    wave = 1 + np.cos(t * np.pi)
    return lr_end + span * wave / 2
422
+
423
+
424
def linear_schedule(t, lr_start, lr_end):
    """Linear interpolation from ``lr_start`` (t=0) to ``lr_end`` (t=1).

    ``t`` must lie in [0, 1].
    """
    assert 0 <= t <= 1
    delta = lr_end - lr_start
    return lr_start + delta * t
427
+
428
+
429
def cycled_linear_schedule(t, lr_start, lr_end, num_cycles=2):
    """Repeat the linear schedule ``num_cycles`` times over t in [0, 1].

    The position inside the current cycle is the fractional part of
    ``t * num_cycles``. At exactly ``t == 1`` the position is forced to 1 so
    the schedule ends on ``lr_end`` rather than wrapping back to ``lr_start``.
    """
    assert 0 <= t <= 1
    scaled = t * num_cycles
    position = 1 if t == 1 else scaled - int(scaled)
    return linear_schedule(position, lr_start, lr_end)
436
+
437
+
438
def adjust_learning_rate_by_lr(optimizer, lr):
    """Set the learning rate of every entry in ``optimizer.param_groups``.

    Groups that carry an ``"lr_scale"`` entry receive ``lr * lr_scale``; all
    other groups receive ``lr`` unchanged.
    """
    for group in optimizer.param_groups:
        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
extern/CUT3R/config/dpt_512_vary_4_64.yaml ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: "ARCroco3DStereo(ARCroco3DStereoConfig(freeze='encoder', state_size=768, state_pe='2d', pos_embed='RoPE100', rgb_head=True, pose_head=True, patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d+pose', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, landscape_only=False))"
2
+ pretrained: cut3r_512_dpt_4_64.pth
3
+ load_only_encoder: False
4
+ long_context: True
5
+ fixed_length: False
6
+ resume: null
7
+ benchmark: False
8
+ num_views : 64
9
+ num_test_views : 4
10
+ n_corres_train: 0
11
+ n_corres_test: 0
12
+
13
+ train_criterion: ConfLoss(Regr3DPoseBatchList(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
14
+ test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
15
+
16
+ resolution: [(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)]
17
+
18
+ allow_repeat: True
19
+ dataset1: Co3d_Multi(allow_repeat=${allow_repeat}, split='train', ROOT='../../data/dust3r_data/processed_co3d/', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
20
+ dataset2: WildRGBD_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_wildrgbd", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
21
+
22
+ dataset3: ARKitScenes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
23
+ dataset4: ARKitScenesHighRes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
24
+ dataset5: ScanNetpp_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_scannetpp/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
25
+ dataset6: ScanNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_scannet/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
26
+ dataset7: HyperSim_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_hypersim", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
27
+
28
+ dataset8: BlendedMVS_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
29
+ dataset9: MegaDepth_Multi(allow_repeat=${allow_repeat}, split="train", ROOT="../../data/dust3r_data/processed_megadepth", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
30
+ dataset10: MapFree_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/mast3r_data/processed_mapfree/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
31
+ dataset11: Waymo_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/dust3r_data/processed_waymo/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
32
+ dataset12: VirtualKITTI2_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/mast3r_data/processed_vkitti", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
33
+ dataset13: UnReal4K_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/mast3r_data/processed_unreal4k/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
34
+ dataset14: TartanAir_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/mast3r_data/processed_tartanair/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
35
+
36
+ dataset15: DL3DV_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_dl3dv", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
37
+
38
+ dataset16: Cop3D_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_cop3d/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
39
+ dataset17: MVImgNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_mvimgnet/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
40
+ dataset18: RE10K_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_re10k/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
41
+ dataset19: OmniObject3D_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_omniobject3d/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
42
+
43
+ dataset20: ThreeDKenBurns(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_3dkb/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
44
+ dataset21: IRS(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_irs/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
45
+ dataset22: SynScapes(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_synscapes/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
46
+ dataset23: UrbanSyn(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_urbansyn/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
47
+ dataset24: EDEN_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_eden", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
48
+ dataset25: SmartPortraits_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_smartportraits", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
49
+
50
+ dataset26: DynamicReplica(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_dynamic_replica/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
51
+ dataset27: Spring(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_spring/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
52
+ dataset28: BEDLAM_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_bedlam", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
53
+ dataset29: MVS_Synth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_mvs_synth", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
54
+ dataset30: PointOdyssey_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_point_odyssey", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
55
+
56
+ dataset31: UASOL_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../../data/custom_data/processed_uasol", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
57
+ dataset32: MP3D_Multi(allow_repeat=${allow_repeat}, split=None, ROOT="../../data/custom_data/processed_mp3d/", aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208), (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
58
+
59
+ train_dataset: 44800 @ ${dataset1} + 56000 @ ${dataset2} + 56000 @ ${dataset3} + 22400 @ ${dataset4}
60
+ + 16800 @ ${dataset5} + 22400 @ ${dataset6} + 11200 @ ${dataset7}
61
+ + 22400 @ ${dataset8} + 22400 @ ${dataset9} + 84000 @ ${dataset10} + 56000 @ ${dataset11}
62
+ + 5600 @ ${dataset12} + 168 @ ${dataset13} + 56000 @ ${dataset14} + 84000 @ ${dataset15}
63
+ + 480 @ ${dataset16} + 19200 @ ${dataset17} + 4800 @ ${dataset18} + 38400 @ ${dataset19}
64
+ + 26400 @ ${dataset26} + 1200 @ ${dataset27} + 36000 @ ${dataset28} + 2400 @ ${dataset29}
65
+ + 24000 @ ${dataset30} + 14400 @ ${dataset31} + 28800 @ ${dataset32}
66
+ test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=(512, 384), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
67
+
68
+ seed: 0
69
+ batch_size: 4
70
+ accum_iter: 4
71
+ gradient_checkpointing: True
72
+ epochs: 10
73
+ start_epoch: 0
74
+ weight_decay: 0.05
75
+ lr: 1e-6
76
+ min_lr: 1e-7
77
+ warmup_epochs: 0.5
78
+ amp: 1
79
+
80
+ num_workers: 4
81
+ world_size: 1
82
+ local-rank: -1
83
+ dist_url: 'env://'
84
+ rank: 0
85
+ gpu: 0
86
+ distributed: False
87
+ dist_backend: 'nccl'
88
+
89
+ eval_freq: 1
90
+ save_freq: 0.1
91
+ keep_freq: 1
92
+ print_freq: 10
93
+ print_img_freq: 50000000
94
+ num_imgs_vis: 4
95
+ save_dir: 'checkpoints'
96
+ exp_name: 'dpt_512_vary_4_64'
97
+ task: 'cut3r'
98
+ logdir: ./${save_dir}/${exp_name}/logs
99
+ output_dir: ./${save_dir}/${exp_name}/
100
+ hydra:
101
+ verbose: True
102
+ run:
103
+ dir: ./${save_dir}/${exp_name}
extern/CUT3R/config/linear_224_fixed_16.yaml ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: "ARCroco3DStereo(ARCroco3DStereoConfig(freeze='encoder', state_size=768, state_pe='2d', pos_embed='RoPE100', rgb_head=True, pose_head=True, img_size=(224, 224), head_type='linear', output_mode='pts3d+pose', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12))"
2
+ pretrained: cut3r_224_linear_4.pth
3
+ load_only_encoder: False
4
+ long_context: False
5
+ fixed_length: True
6
+ resume: null
7
+ benchmark: True
8
+ num_views : 16
9
+ num_test_views : 4
10
+ n_corres_train: 0
11
+ n_corres_test: 0
12
+
13
+ train_criterion: ConfLoss(Regr3DPoseBatchList(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
14
+ test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
15
+
16
+
17
+ dataset1: Co3d_Multi(allow_repeat=False, split='train', ROOT='../../data/dust3r_data/processed_co3d/', aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
18
+ dataset2: WildRGBD_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_wildrgbd", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
19
+
20
+ dataset3: ARKitScenes_Multi(allow_repeat=False, split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/', aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
21
+ dataset4: ARKitScenesHighRes_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
22
+ dataset5: ScanNetpp_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_scannetpp/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
23
+ dataset6: ScanNet_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_scannet/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
24
+ dataset7: HyperSim_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_hypersim", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
25
+
26
+ dataset8: BlendedMVS_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
27
+ dataset9: MegaDepth_Multi(allow_repeat=False, split="train", ROOT="../../data/dust3r_data/processed_megadepth", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
28
+ dataset10: MapFree_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_mapfree/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
29
+ dataset11: Waymo_Multi(allow_repeat=False, split=None, ROOT="../../data/dust3r_data/processed_waymo/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
30
+ dataset12: VirtualKITTI2_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_vkitti", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
31
+ dataset13: UnReal4K_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_unreal4k/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
32
+ dataset14: TartanAir_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_tartanair/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
33
+
34
+ dataset15: DL3DV_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_dl3dv", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
35
+
36
+ dataset16: Cop3D_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_cop3d/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
37
+ dataset17: MVImgNet_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_mvimgnet/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
38
+ dataset18: RE10K_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_re10k/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
39
+ dataset19: OmniObject3D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_omniobject3d/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
40
+
41
+ dataset20: ThreeDKenBurns(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_3dkb/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
42
+ dataset21: IRS(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_irs/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
43
+ dataset22: SynScapes(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_synscapes/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
44
+ dataset23: UrbanSyn(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_urbansyn/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
45
+ dataset24: EDEN_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_eden", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
46
+ dataset25: SmartPortraits_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_smartportraits", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
47
+
48
+ dataset26: DynamicReplica(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_dynamic_replica/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
49
+ dataset27: Spring(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_spring/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
50
+ dataset28: BEDLAM_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_bedlam", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
51
+ dataset29: MVS_Synth_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_mvs_synth", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
52
+ dataset30: PointOdyssey_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_point_odyssey", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
53
+
54
+ dataset31: UASOL_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_uasol", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
55
+ dataset32: MP3D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_mp3d/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
56
+
57
+ dataset33: HOI4D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_hoi4d/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
58
+
59
+ train_dataset: 44800 @ ${dataset1} + 56000 @ ${dataset2} + 56000 @ ${dataset3} + 5600 @ ${dataset4} + 5600 @ ${dataset5} + 140000 @ ${dataset6} + 5600 @ ${dataset7} + 22400 @ ${dataset8} + 16800 @ ${dataset9} + 56000 @ ${dataset10} + 42000 @ ${dataset11} + 5600 @ ${dataset12} + 168 @ ${dataset13} + 84000 @ ${dataset14} + 84000 @ ${dataset15} + 7200 @ ${dataset16} + 19200 @ ${dataset17} + 9600 @ ${dataset18} + 24000 @ ${dataset19} + 33600 @ ${dataset26} + 2400 @ ${dataset27} + 9600 @ ${dataset28} + 4800 @ ${dataset29} + 28800 @ ${dataset30} + 14400 @ ${dataset31} + 19200 @ ${dataset32}
60
+
61
+
62
+ test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
63
+
64
+ seed: 0
65
+ batch_size: 6
66
+ accum_iter: 2
67
+ gradient_checkpointing: False
68
+ epochs: 10
69
+ start_epoch: 0
70
+ weight_decay: 0.05
71
+ lr: 1e-6
72
+ min_lr: 1e-7
73
+ warmup_epochs: 0.5
74
+ amp: 1
75
+
76
+ num_workers: 16
77
+ world_size: 1
78
+ local-rank: -1
79
+ dist_url: 'env://'
80
+ rank: 0
81
+ gpu: 0
82
+ distributed: False
83
+ dist_backend: 'nccl'
84
+
85
+ eval_freq: 1
86
+ save_freq: 0.1
87
+ keep_freq: 1
88
+ print_freq: 10
89
+ print_img_freq: 50000000
90
+ num_imgs_vis: 4
91
+ save_dir: 'checkpoints'
92
+ exp_name: 'linear_224_fixed_16'
93
+ task: 'cut3r'
94
+ logdir: ./${save_dir}/${exp_name}/logs
95
+ output_dir: ./${save_dir}/${exp_name}/
96
+ hydra:
97
+ verbose: True
98
+ run:
99
+ dir: ./${save_dir}/${exp_name}
extern/CUT3R/config/stage1.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: "ARCroco3DStereo(ARCroco3DStereoConfig(state_size=768, state_pe='2d', pos_embed='RoPE100', rgb_head=True, pose_head=True, img_size=(224, 224), head_type='linear', output_mode='pts3d+pose', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12))"
2
+ pretrained: null
3
+ load_only_encoder: False
4
+ long_context: False
5
+ fixed_length: True
6
+ resume: null
7
+ benchmark: True
8
+ num_views : 4
9
+ num_test_views : 4
10
+ n_corres_train: 0
11
+ n_corres_test: 0
12
+
13
+ train_criterion: ConfLoss(Regr3DPose(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
14
+ test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
15
+
16
+ dataset1: Co3d_Multi(split='train', ROOT='../../data/dust3r_data/processed_co3d/', aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
17
+ dataset2: WildRGBD_Multi(split='train', ROOT="../../data/dust3r_data/processed_wildrgbd", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
18
+
19
+ dataset3: ARKitScenes_Multi(split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/', aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
20
+ dataset4: ARKitScenesHighRes_Multi(split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
21
+ dataset5: ScanNetpp_Multi(split='train', ROOT="../../data/dust3r_data/processed_scannetpp/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
22
+ dataset6: ScanNet_Multi(split='train', ROOT="../../data/dust3r_data/processed_scannet/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
23
+ dataset7: HyperSim_Multi(split='train', ROOT="../../data/custom_data/processed_hypersim", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
24
+
25
+ dataset8: BlendedMVS_Multi(split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
26
+ dataset9: MegaDepth_Multi(split="train", ROOT="../../data/dust3r_data/processed_megadepth", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
27
+ dataset10: MapFree_Multi(split=None, ROOT="../../data/mast3r_data/processed_mapfree/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
28
+ dataset11: Waymo_Multi(split=None, ROOT="../../data/dust3r_data/processed_waymo/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
29
+ dataset12: VirtualKITTI2_Multi(split=None, ROOT="../../data/mast3r_data/processed_vkitti", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
30
+ dataset13: UnReal4K_Multi(split=None, ROOT="../../data/mast3r_data/processed_unreal4k/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
31
+ dataset14: TartanAir_Multi(split=None, ROOT="../../data/mast3r_data/processed_tartanair/", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
32
+
33
+ dataset15: DL3DV_Multi(split='train', ROOT="../../data/custom_data/processed_dl3dv", aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
34
+
35
+ train_dataset: 32000 @ ${dataset1} + 48000 @ ${dataset2} + 100800 @ ${dataset3} + 56000 @ ${dataset4} + 33600 @ ${dataset5} + 56000 @ ${dataset6} + 33600 @ ${dataset7} + 33600 @ ${dataset8} + 33600 @ ${dataset9} + 100800 @ ${dataset10} + 78400 @ ${dataset11} + 5000 @ ${dataset12} + 1000 @ ${dataset13} + 33600 @ ${dataset14} + 160000 @ ${dataset15}
36
+ test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=4, seed=42, n_corres=0)
37
+
38
+
39
+ seed: 0
40
+ batch_size: 16
41
+ accum_iter: 1
42
+ gradient_checkpointing: False
43
+ epochs: 100
44
+ start_epoch: 0
45
+ weight_decay: 0.05
46
+ lr: 1e-4
47
+ min_lr: 1e-6
48
+ warmup_epochs: 10
49
+ amp: 1
50
+
51
+ num_workers: 8
52
+ world_size: 1
53
+ local-rank: -1
54
+ dist_url: 'env://'
55
+ rank: 0
56
+ gpu: 0
57
+ distributed: False
58
+ dist_backend: 'nccl'
59
+
60
+ eval_freq: 1
61
+ save_freq: 1
62
+ keep_freq: 10
63
+ print_freq: 10
64
+ print_img_freq: 500
65
+ num_imgs_vis: 4
66
+ save_dir: 'checkpoints'
67
+ exp_name: 'train_first_stage'
68
+ task: 'cut3r'
69
+ logdir: ./${save_dir}/${exp_name}/logs
70
+ output_dir: ./${save_dir}/${exp_name}/
71
+ hydra:
72
+ verbose: True
73
+ run:
74
+ dir: ./${save_dir}/${exp_name}
extern/CUT3R/config/stage2.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: ARCroco3DStereo(ARCroco3DStereoConfig(state_size=768, state_pe='2d', pos_embed='RoPE100',
2
+ rgb_head=True, pose_head=True, img_size=(224, 224), head_type='linear', output_mode='pts3d+pose',
3
+ depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf,
4
+ inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12,
5
+ dec_num_heads=12))
6
+ pretrained: checkpoints/train_first_stage/checkpoint-final.pth
7
+ load_only_encoder: False
8
+ long_context: False
9
+ fixed_length: True
10
+ resume: null
11
+ benchmark: True
12
+ num_views : 4
13
+ num_test_views : 4
14
+ n_corres_train: 0
15
+ n_corres_test: 0
16
+
17
+
18
+ train_criterion: ConfLoss(Regr3DPoseBatchList(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
19
+ test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
20
+
21
+
22
+ dataset1: Co3d_Multi(split='train', ROOT='../../data/dust3r_data/processed_co3d/',
23
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
24
+ dataset2: WildRGBD_Multi(split='train', ROOT="../../data/dust3r_data/processed_wildrgbd",
25
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
26
+ dataset3: ARKitScenes_Multi(split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/',
27
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
28
+ dataset4: ARKitScenesHighRes_Multi(split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres",
29
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
30
+ dataset5: ScanNetpp_Multi(split='train', ROOT="../../data/dust3r_data/processed_scannetpp/",
31
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
32
+ dataset6: ScanNet_Multi(split='train', ROOT="../../data/dust3r_data/processed_scannet/",
33
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
34
+ dataset7: HyperSim_Multi(split='train', ROOT="../../data/custom_data/processed_hypersim",
35
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
36
+ dataset8: BlendedMVS_Multi(split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/",
37
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
38
+ dataset9: MegaDepth_Multi(split="train", ROOT="../../data/dust3r_data/processed_megadepth",
39
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
40
+ dataset10: MapFree_Multi(split=None, ROOT="../../data/mast3r_data/processed_mapfree/",
41
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
42
+ dataset11: Waymo_Multi(split=None, ROOT="../../data/dust3r_data/processed_waymo/",
43
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
44
+ dataset12: VirtualKITTI2_Multi(split=None, ROOT="../../data/mast3r_data/processed_vkitti",
45
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
46
+ dataset13: UnReal4K_Multi(split=None, ROOT="../../data/mast3r_data/processed_unreal4k/",
47
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
48
+ dataset14: TartanAir_Multi(split=None, ROOT="../../data/mast3r_data/processed_tartanair/",
49
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
50
+ dataset15: DL3DV_Multi(split='train', ROOT="../../data/custom_data/processed_dl3dv",
51
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
52
+ dataset16: Cop3D_Multi(split='train', ROOT="../../data/custom_data/processed_cop3d/",
53
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
54
+ dataset17: MVImgNet_Multi(split='train', ROOT="../../data/custom_data/processed_mvimgnet/",
55
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
56
+ dataset18: RE10K_Multi(split=None, ROOT="../../data/custom_data/processed_re10k/",
57
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
58
+ dataset19: OmniObject3D_Multi(split=None, ROOT="../../data/custom_data/processed_omniobject3d/",
59
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
60
+ dataset20: ThreeDKenBurns(split=None, ROOT="../../data/custom_data/processed_3dkb/",
61
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
62
+ dataset21: IRS(split=None, ROOT="../../data/custom_data/processed_irs/", aug_crop=16,
63
+ resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
64
+ dataset22: SynScapes(split=None, ROOT="../../data/custom_data/processed_synscapes/",
65
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
66
+ dataset23: UrbanSyn(split=None, ROOT="../../data/custom_data/processed_urbansyn/",
67
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
68
+ dataset24: EDEN_Multi(split='train', ROOT="../../data/custom_data/processed_eden",
69
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
70
+ dataset25: SmartPortraits_Multi(split='train', ROOT="../../data/custom_data/processed_smartportraits",
71
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
72
+ dataset26: DynamicReplica(split='train', ROOT="../../data/custom_data/processed_dynamic_replica/",
73
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
74
+ dataset27: Spring(split=None, ROOT="../../data/custom_data/processed_spring/", aug_crop=16,
75
+ resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
76
+ dataset28: BEDLAM_Multi(split='train', ROOT="../../data/custom_data/processed_bedlam",
77
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
78
+ dataset29: MVS_Synth_Multi(split='train', ROOT="../../data/custom_data/processed_mvs_synth",
79
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
80
+ dataset30: PointOdyssey_Multi(split='train', ROOT="../../data/custom_data/processed_point_odyssey",
81
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
82
+ dataset31: UASOL_Multi(split='train', ROOT="../../data/custom_data/processed_uasol",
83
+ aug_crop=16, resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
84
+ dataset32: MP3D_Multi(split=None, ROOT="../../data/custom_data/processed_mp3d/", aug_crop=16,
85
+ resolution=224, transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
86
+ train_dataset: 48000 @ ${dataset1} + 60000 @ ${dataset2} + 54000 @ ${dataset3} + 18000
87
+ @ ${dataset4} + 6000 @ ${dataset5} + 42000 @ ${dataset6} + 12000 @ ${dataset7} +
88
+ 6000 @ ${dataset8} + 6000 @ ${dataset9} + 60000 @ ${dataset10} + 48000 @ ${dataset11}
89
+ + 2400 @ ${dataset12} + 180 @ ${dataset13} + 18000 @ ${dataset14} + 222000 @ ${dataset15}
90
+ + 400 @ ${dataset16} + 16000 @ ${dataset17} + 4000 @ ${dataset18} + 32000 @ ${dataset19}
91
+ + 4000 @ ${dataset20} + 2000 @ ${dataset21} + 2000 @ ${dataset22} + 500 @ ${dataset23}
92
+ + 12000 @ ${dataset24} + 16000 @ ${dataset25} + 20000 @ ${dataset26} + 400 @ ${dataset27}
93
+ + 32000 @ ${dataset28} + 2000 @ ${dataset29} + 20000 @ ${dataset30} + 12000 @ ${dataset31}
94
+ + 24000 @ ${dataset32}
95
+ test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=4, seed=42, n_corres=0)
96
+
97
+ seed: 0
98
+ batch_size: 16
99
+ accum_iter: 1
100
+ gradient_checkpointing: false
101
+ epochs: 35
102
+ start_epoch: 0
103
+ weight_decay: 0.05
104
+ lr: 5.0e-06
105
+ min_lr: 1.0e-06
106
+ warmup_epochs: 1
107
+ amp: 1
108
+
109
+ num_workers: 8
110
+ world_size: 1
111
+ local-rank: -1
112
+ dist_url: 'env://'
113
+ rank: 0
114
+ gpu: 0
115
+ distributed: False
116
+ dist_backend: 'nccl'
117
+
118
+ eval_freq: 1
119
+ save_freq: 1
120
+ keep_freq: 10
121
+ print_freq: 10
122
+ print_img_freq: 500
123
+ num_imgs_vis: 4
124
+ save_dir: 'checkpoints'
125
+ exp_name: 'train_second_stage'
126
+ task: 'cut3r'
127
+ logdir: ./${save_dir}/${exp_name}/logs
128
+ output_dir: ./${save_dir}/${exp_name}/
129
+ hydra:
130
+ verbose: True
131
+ run:
132
+ dir: ./${save_dir}/${exp_name}
extern/CUT3R/config/stage3.yaml ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: ARCroco3DStereo(ARCroco3DStereoConfig(state_size=768, state_pe='2d', pos_embed='RoPE100',
2
+ rgb_head=True, pose_head=True, patch_embed_cls='ManyAR_PatchEmbed', img_size=(512,
3
+ 512), head_type='dpt', output_mode='pts3d+pose', depth_mode=('exp', -inf, inf),
4
+ conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024, enc_depth=24,
5
+ enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12, landscape_only=False))
6
+ pretrained: checkpoints/train_second_stage/checkpoint-final.pth
7
+ load_only_encoder: False
8
+ long_context: False
9
+ fixed_length: True
10
+ resume: null
11
+ benchmark: True
12
+ num_views : 4
13
+ num_test_views : 4
14
+ n_corres_train: 0
15
+ n_corres_test: 0
16
+
17
+
18
+ train_criterion: ConfLoss(Regr3DPoseBatchList(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
19
+ test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0) + RGBLoss(L21)
20
+
21
+ resolution:
22
+ - (512
23
+ - 384)
24
+ - (512
25
+ - 336)
26
+ - (512
27
+ - 288)
28
+ - (512
29
+ - 256)
30
+ - (512
31
+ - 208)
32
+ - (512
33
+ - 144)
34
+ - (384
35
+ - 512)
36
+ - (336
37
+ - 512)
38
+ - (288
39
+ - 512)
40
+ - (256
41
+ - 512)
42
+ dataset1: Co3d_Multi(allow_repeat=True, split='train', ROOT='../../data/dust3r_data/processed_co3d/',
43
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
44
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
45
+ num_views=${num_views}, n_corres=${n_corres_train})
46
+ dataset2: WildRGBD_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_wildrgbd",
47
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
48
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
49
+ num_views=${num_views}, n_corres=${n_corres_train})
50
+ dataset3: ARKitScenes_Multi(allow_repeat=True, split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/',
51
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
52
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
53
+ num_views=${num_views}, n_corres=${n_corres_train})
54
+ dataset4: ARKitScenesHighRes_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres",
55
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
56
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
57
+ num_views=${num_views}, n_corres=${n_corres_train})
58
+ dataset5: ScanNetpp_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_scannetpp/",
59
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
60
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
61
+ num_views=${num_views}, n_corres=${n_corres_train})
62
+ dataset6: ScanNet_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_scannet/",
63
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
64
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
65
+ num_views=${num_views}, n_corres=${n_corres_train})
66
+ dataset7: HyperSim_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_hypersim",
67
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
68
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
69
+ num_views=${num_views}, n_corres=${n_corres_train})
70
+ dataset8: BlendedMVS_Multi(allow_repeat=True, split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/",
71
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
72
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
73
+ num_views=${num_views}, n_corres=${n_corres_train})
74
+ dataset9: MegaDepth_Multi(allow_repeat=True, split="train", ROOT="../../data/dust3r_data/processed_megadepth",
75
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
76
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
77
+ num_views=${num_views}, n_corres=${n_corres_train})
78
+ dataset10: MapFree_Multi(allow_repeat=True, split=None, ROOT="../../data/mast3r_data/processed_mapfree/",
79
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
80
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
81
+ num_views=${num_views}, n_corres=${n_corres_train})
82
+ dataset11: Waymo_Multi(allow_repeat=True, split=None, ROOT="../../data/dust3r_data/processed_waymo/",
83
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
84
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
85
+ num_views=${num_views}, n_corres=${n_corres_train})
86
+ dataset12: VirtualKITTI2_Multi(allow_repeat=True, split=None, ROOT="../../data/mast3r_data/processed_vkitti",
87
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
88
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
89
+ num_views=${num_views}, n_corres=${n_corres_train})
90
+ dataset13: UnReal4K_Multi(allow_repeat=True, split=None, ROOT="../../data/mast3r_data/processed_unreal4k/",
91
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
92
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
93
+ num_views=${num_views}, n_corres=${n_corres_train})
94
+ dataset14: TartanAir_Multi(allow_repeat=True, split=None, ROOT="../../data/mast3r_data/processed_tartanair/",
95
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
96
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
97
+ num_views=${num_views}, n_corres=${n_corres_train})
98
+ dataset15: DL3DV_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_dl3dv",
99
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
100
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
101
+ num_views=${num_views}, n_corres=${n_corres_train})
102
+ dataset16: Cop3D_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_cop3d/",
103
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
104
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
105
+ num_views=${num_views}, n_corres=${n_corres_train})
106
+ dataset17: MVImgNet_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_mvimgnet/",
107
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
108
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
109
+ num_views=${num_views}, n_corres=${n_corres_train})
110
+ dataset18: RE10K_Multi(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_re10k/",
111
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
112
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
113
+ num_views=${num_views}, n_corres=${n_corres_train})
114
+ dataset19: OmniObject3D_Multi(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_omniobject3d/",
115
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
116
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
117
+ num_views=${num_views}, n_corres=${n_corres_train})
118
+ dataset20: ThreeDKenBurns(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_3dkb/",
119
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
120
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
121
+ num_views=${num_views}, n_corres=${n_corres_train})
122
+ dataset21: IRS(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_irs/",
123
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
124
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
125
+ num_views=${num_views}, n_corres=${n_corres_train})
126
+ dataset22: SynScapes(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_synscapes/",
127
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
128
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
129
+ num_views=${num_views}, n_corres=${n_corres_train})
130
+ dataset23: UrbanSyn(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_urbansyn/",
131
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
132
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
133
+ num_views=${num_views}, n_corres=${n_corres_train})
134
+ dataset24: EDEN_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_eden",
135
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
136
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
137
+ num_views=${num_views}, n_corres=${n_corres_train})
138
+ dataset25: SmartPortraits_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_smartportraits",
139
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
140
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
141
+ num_views=${num_views}, n_corres=${n_corres_train})
142
+ dataset26: DynamicReplica(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_dynamic_replica/",
143
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
144
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
145
+ num_views=${num_views}, n_corres=${n_corres_train})
146
+ dataset27: Spring(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_spring/",
147
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
148
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
149
+ num_views=${num_views}, n_corres=${n_corres_train})
150
+ dataset28: BEDLAM_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_bedlam",
151
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
152
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
153
+ num_views=${num_views}, n_corres=${n_corres_train})
154
+ dataset29: MVS_Synth_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_mvs_synth",
155
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
156
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
157
+ num_views=${num_views}, n_corres=${n_corres_train})
158
+ dataset30: PointOdyssey_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_point_odyssey",
159
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
160
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
161
+ num_views=${num_views}, n_corres=${n_corres_train})
162
+ dataset31: UASOL_Multi(allow_repeat=True, split='train', ROOT="../../data/custom_data/processed_uasol",
163
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
164
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
165
+ num_views=${num_views}, n_corres=${n_corres_train})
166
+ dataset32: MP3D_Multi(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_mp3d/",
167
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
168
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
169
+ num_views=${num_views}, n_corres=${n_corres_train})
170
+ dataset33: HOI4D_Multi(allow_repeat=True, split=None, ROOT="../../data/custom_data/processed_hoi4d/",
171
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
172
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
173
+ num_views=${num_views}, n_corres=${n_corres_train})
174
+ train_dataset: 44800 @ ${dataset1} + 56000 @ ${dataset2} + 56000 @ ${dataset3} + 22400
175
+ @ ${dataset4} + 16800 @ ${dataset5} + 38400 @ ${dataset6} + 11200 @ ${dataset7}
176
+ + 22400 @ ${dataset8} + 22400 @ ${dataset9} + 84000 @ ${dataset10} + 20000 @ ${dataset11}
177
+ + 5600 @ ${dataset12} + 168 @ ${dataset13} + 56000 @ ${dataset14} + 74000 @ ${dataset15}
178
+ + 480 @ ${dataset16} + 19200 @ ${dataset17} + 4800 @ ${dataset18} + 4800 @ ${dataset20}
179
+ + 2400 @ ${dataset21} + 2400 @ ${dataset22} + 600 @ ${dataset23} + 19200 @ ${dataset25}
180
+ + 36000 @ ${dataset26} + 9400 @ ${dataset27} + 36000 @ ${dataset28} + 1400 @ ${dataset29}
181
+ + 7200 @ ${dataset30} + 14400 @ ${dataset31} + 28800 @ ${dataset32} + 12000 @ ${dataset33}
182
+ test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=4, seed=42, n_corres=0)
183
+
184
+ seed: 0
185
+ batch_size: 16
186
+ accum_iter: 1
187
+ gradient_checkpointing: true
188
+ epochs: 40
189
+ start_epoch: 0
190
+ weight_decay: 0.05
191
+ lr: 1.0e-05
192
+ min_lr: 1.0e-06
193
+ warmup_epochs: 2
194
+ amp: 1
195
+
196
+ num_workers: 8
197
+ world_size: 1
198
+ local-rank: -1
199
+ dist_url: 'env://'
200
+ rank: 0
201
+ gpu: 0
202
+ distributed: False
203
+ dist_backend: 'nccl'
204
+
205
+ eval_freq: 1
206
+ save_freq: 1
207
+ keep_freq: 10
208
+ print_freq: 10
209
+ print_img_freq: 500
210
+ num_imgs_vis: 4
211
+ save_dir: 'checkpoints'
212
+ exp_name: 'train_third_stage'
213
+ task: 'cut3r'
214
+ logdir: ./${save_dir}/${exp_name}/logs
215
+ output_dir: ./${save_dir}/${exp_name}/
216
+ hydra:
217
+ verbose: True
218
+ run:
219
+ dir: ./${save_dir}/${exp_name}
extern/CUT3R/config/stage4.yaml ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: ARCroco3DStereo(ARCroco3DStereoConfig(freeze='encoder', state_size=768, state_pe='2d',
2
+ pos_embed='RoPE100', rgb_head=True, pose_head=True, patch_embed_cls='ManyAR_PatchEmbed',
3
+ img_size=(512, 512), head_type='dpt', output_mode='pts3d+pose', depth_mode=('exp',
4
+ -inf, inf), conf_mode=('exp', 1, inf), pose_mode=('exp', -inf, inf), enc_embed_dim=1024,
5
+ enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12,
6
+ landscape_only=False))
7
+ pretrained: checkpoints/train_third_stage/checkpoint-final.pth
8
+ load_only_encoder: False
9
+ long_context: True
10
+ fixed_length: True
11
+ resume: null
12
+ benchmark: True
13
+ num_views : 32
14
+ num_test_views : 4
15
+ n_corres_train: 0
16
+ n_corres_test: 0
17
+
18
+ train_criterion: ConfLoss(Regr3DPose(L21, norm_mode='?avg_dis'), alpha=0.2) + RGBLoss(MSE)
19
+ test_criterion: Regr3DPose(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0)
20
+ + Regr3DPose_ScaleInv(L21, norm_mode='?avg_dis', gt_scale=True, sky_loss_value=0)
21
+ + RGBLoss(L21)
22
+ resolution:
23
+ - (512
24
+ - 384)
25
+ - (512
26
+ - 336)
27
+ - (512
28
+ - 288)
29
+ - (512
30
+ - 256)
31
+ - (512
32
+ - 208)
33
+ - (512
34
+ - 144)
35
+ - (384
36
+ - 512)
37
+ - (336
38
+ - 512)
39
+ - (288
40
+ - 512)
41
+ - (256
42
+ - 512)
43
+ dataset1: Co3d_Multi(allow_repeat=False, split='train', ROOT='../../data/dust3r_data/processed_co3d/',
44
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
45
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
46
+ num_views=${num_views}, n_corres=${n_corres_train})
47
+ dataset2: WildRGBD_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_wildrgbd",
48
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
49
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
50
+ num_views=${num_views}, n_corres=${n_corres_train})
51
+ dataset3: ARKitScenes_Multi(allow_repeat=False, split='train', ROOT='../../data/dust3r_data/processed_arkitscenes/',
52
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
53
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
54
+ num_views=${num_views}, n_corres=${n_corres_train})
55
+ dataset4: ARKitScenesHighRes_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_arkitscenes_highres",
56
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
57
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
58
+ num_views=${num_views}, n_corres=${n_corres_train})
59
+ dataset5: ScanNetpp_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_scannetpp/",
60
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
61
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
62
+ num_views=${num_views}, n_corres=${n_corres_train})
63
+ dataset6: ScanNet_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_scannet/",
64
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
65
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
66
+ num_views=${num_views}, n_corres=${n_corres_train})
67
+ dataset7: HyperSim_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_hypersim",
68
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
69
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
70
+ num_views=${num_views}, n_corres=${n_corres_train})
71
+ dataset8: BlendedMVS_Multi(allow_repeat=False, split='train', ROOT="../../data/dust3r_data/processed_blendedmvs/",
72
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
73
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
74
+ num_views=${num_views}, n_corres=${n_corres_train})
75
+ dataset9: MegaDepth_Multi(allow_repeat=False, split="train", ROOT="../../data/dust3r_data/processed_megadepth",
76
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
77
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
78
+ num_views=${num_views}, n_corres=${n_corres_train})
79
+ dataset10: MapFree_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_mapfree/",
80
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
81
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
82
+ num_views=${num_views}, n_corres=${n_corres_train})
83
+ dataset11: Waymo_Multi(allow_repeat=False, split=None, ROOT="../../data/dust3r_data/processed_waymo/",
84
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
85
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
86
+ num_views=${num_views}, n_corres=${n_corres_train})
87
+ dataset12: VirtualKITTI2_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_vkitti",
88
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
89
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
90
+ num_views=${num_views}, n_corres=${n_corres_train})
91
+ dataset13: UnReal4K_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_unreal4k/",
92
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
93
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
94
+ num_views=${num_views}, n_corres=${n_corres_train})
95
+ dataset14: TartanAir_Multi(allow_repeat=False, split=None, ROOT="../../data/mast3r_data/processed_tartanair/",
96
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
97
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
98
+ num_views=${num_views}, n_corres=${n_corres_train})
99
+ dataset15: DL3DV_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_dl3dv",
100
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
101
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
102
+ num_views=${num_views}, n_corres=${n_corres_train})
103
+ dataset16: Cop3D_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_cop3d/",
104
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
105
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
106
+ num_views=${num_views}, n_corres=${n_corres_train})
107
+ dataset17: MVImgNet_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_mvimgnet/",
108
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
109
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
110
+ num_views=${num_views}, n_corres=${n_corres_train})
111
+ dataset18: RE10K_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_re10k/",
112
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
113
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
114
+ num_views=${num_views}, n_corres=${n_corres_train})
115
+ dataset19: OmniObject3D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_omniobject3d/",
116
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
117
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
118
+ num_views=${num_views}, n_corres=${n_corres_train})
119
+ dataset20: ThreeDKenBurns(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_3dkb/",
120
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
121
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
122
+ num_views=${num_views}, n_corres=${n_corres_train})
123
+ dataset21: IRS(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_irs/",
124
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
125
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
126
+ num_views=${num_views}, n_corres=${n_corres_train})
127
+ dataset22: SynScapes(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_synscapes/",
128
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
129
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
130
+ num_views=${num_views}, n_corres=${n_corres_train})
131
+ dataset23: UrbanSyn(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_urbansyn/",
132
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
133
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
134
+ num_views=${num_views}, n_corres=${n_corres_train})
135
+ dataset24: EDEN_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_eden",
136
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
137
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
138
+ num_views=${num_views}, n_corres=${n_corres_train})
139
+ dataset25: SmartPortraits_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_smartportraits",
140
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
141
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
142
+ num_views=${num_views}, n_corres=${n_corres_train})
143
+ dataset26: DynamicReplica(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_dynamic_replica/",
144
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
145
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
146
+ num_views=${num_views}, n_corres=${n_corres_train})
147
+ dataset27: Spring(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_spring/",
148
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
149
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
150
+ num_views=${num_views}, n_corres=${n_corres_train})
151
+ dataset28: BEDLAM_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_bedlam",
152
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
153
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
154
+ num_views=${num_views}, n_corres=${n_corres_train})
155
+ dataset29: MVS_Synth_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_mvs_synth",
156
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
157
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
158
+ num_views=${num_views}, n_corres=${n_corres_train})
159
+ dataset30: PointOdyssey_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_point_odyssey",
160
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
161
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
162
+ num_views=${num_views}, n_corres=${n_corres_train})
163
+ dataset31: UASOL_Multi(allow_repeat=False, split='train', ROOT="../../data/custom_data/processed_uasol",
164
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
165
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
166
+ num_views=${num_views}, n_corres=${n_corres_train})
167
+ dataset32: MP3D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_mp3d/",
168
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
169
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
170
+ num_views=${num_views}, n_corres=${n_corres_train})
171
+ dataset33: HOI4D_Multi(allow_repeat=False, split=None, ROOT="../../data/custom_data/processed_hoi4d/",
172
+ aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 208),
173
+ (512, 144), (384, 512), (336, 512), (288, 512), (256, 512)], transform=SeqColorJitter,
174
+ num_views=${num_views}, n_corres=${n_corres_train})
175
+ train_dataset: 22400 @ ${dataset1} + 28000 @ ${dataset2} + 28000 @ ${dataset3} + 2800
176
+ @ ${dataset4} + 2800 @ ${dataset5} + 70000 @ ${dataset6} + 2800 @ ${dataset7} +
177
+ 11200 @ ${dataset8} + 8400 @ ${dataset9} + 28000 @ ${dataset10} + 21000 @ ${dataset11}
178
+ + 2800 @ ${dataset12} + 84 @ ${dataset13} + 42000 @ ${dataset14} + 42000 @ ${dataset15}
179
+ + 3600 @ ${dataset16} + 9600 @ ${dataset17} + 4800 @ ${dataset18} + 12000 @ ${dataset19}
180
+ + 16800 @ ${dataset26} + 1200 @ ${dataset27} + 4800 @ ${dataset28} + 2400 @ ${dataset29}
181
+ + 14400 @ ${dataset30} + 7200 @ ${dataset31} + 9600 @ ${dataset32}
182
+ test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../../data/dust3r_data/processed_arkitscenes/', resolution=224, num_views=4, seed=42, n_corres=0)
183
+
184
+ seed: 0
185
+ batch_size: 16
186
+ accum_iter: 1
187
+ gradient_checkpointing: true
188
+ epochs: 10
189
+ start_epoch: 0
190
+ weight_decay: 0.05
191
+ lr: 1.0e-06
192
+ min_lr: 1.0e-07
193
+ warmup_epochs: 0.5
194
+ amp: 1
195
+
196
+ num_workers: 8
197
+ world_size: 1
198
+ local-rank: -1
199
+ dist_url: 'env://'
200
+ rank: 0
201
+ gpu: 0
202
+ distributed: False
203
+ dist_backend: 'nccl'
204
+
205
+ eval_freq: 1
206
+ save_freq: 1
207
+ keep_freq: 10
208
+ print_freq: 10
209
+ print_img_freq: 500
210
+ num_imgs_vis: 4
211
+ save_dir: 'checkpoints'
212
+ exp_name: 'train_final_stage'
213
+ task: 'cut3r'
214
+ logdir: ./${save_dir}/${exp_name}/logs
215
+ output_dir: ./${save_dir}/${exp_name}/
216
+ hydra:
217
+ verbose: True
218
+ run:
219
+ dir: ./${save_dir}/${exp_name}
extern/CUT3R/datasets_preprocess/custom_convert2TUM.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import shutil
4
+ import numpy as np
5
+ import cv2 as cv
6
+ import imageio
7
+ from tqdm import tqdm
8
+ from concurrent.futures import ProcessPoolExecutor, as_completed
9
+ import open3d as o3d
10
+ import scipy.ndimage
11
+ import pickle
12
+
13
+ # Set environment variable to limit OpenBLAS threads
14
+ os.environ["OPENBLAS_NUM_THREADS"] = "1"
15
+
16
+ DEPTH_SCALE_FACTOR = 5000
17
+
18
+
19
+ # Point cloud from depth
20
def pointcloudify_depth(depth, intrinsics, dist_coeff, undistort=True):
    """Back-project a depth map into a flat array of 3D points.

    depth: 2D depth image (height x width).
    intrinsics: 3x3 camera matrix belonging to the depth image.
    dist_coeff: distortion coefficients handed to OpenCV; only used when
        ``undistort`` is true.
    undistort: when true, the depth map is remapped (nearest neighbour) to an
        undistorted image and the optimal new camera matrix is used for the
        back-projection; otherwise ``intrinsics`` is used as is.
    returns: float32 array of shape (height * width, 3), one point per pixel
        in row-major pixel order.
    """
    # OpenCV takes image sizes as (width, height)
    size_wh = depth.shape[::-1]

    if not undistort:
        inv_camera = np.linalg.inv(intrinsics)
        depth_map = depth
    else:
        new_camera, _ = cv.getOptimalNewCameraMatrix(
            intrinsics, dist_coeff, size_wh, 1, size_wh
        )
        inv_camera = np.linalg.inv(new_camera)

        remap_x, remap_y = cv.initUndistortRectifyMap(
            intrinsics, dist_coeff, None, new_camera, size_wh, cv.CV_32FC1
        )
        depth_map = cv.remap(depth, remap_x, remap_y, cv.INTER_NEAREST)

    # Homogeneous pixel coordinates (x, y, 1), one row per pixel
    cols, rows = np.meshgrid(np.arange(size_wh[0]), np.arange(size_wh[1]))
    pixels = np.stack((cols, rows, np.ones_like(cols)), axis=-1).reshape(-1, 3)

    # Viewing ray of every pixel, then scaled by that pixel's depth
    rays = (inv_camera @ pixels.T).T
    points = rays * depth_map.reshape(-1, 1)

    return points.astype(np.float32)
49
+
50
+
51
def project_pcd_to_depth(pcd, undist_intrinsics, img_size, config):
    """Render a point cloud into a sparse depth image with a pinhole camera.

    pcd: object whose ``points`` attribute converts to an (N, 3) array of
        camera-frame coordinates.
    undist_intrinsics: 3x3 camera matrix used for the projection.
    img_size: (height, width); projected pixels outside it are discarded.
    config: mapping providing "res_h" and "res_w", the size of the output.
    returns: float32 array of shape (res_h, res_w) holding the z value of the
        nearest point that lands on each pixel; pixels hit by no point are 0.
    """
    h, w = img_size
    out_h, out_w = config["res_h"], config["res_w"]
    points = np.asarray(pcd.points, dtype=np.float64).reshape(-1, 3)

    # Only finite points strictly in front of the camera can be projected.
    # Dropping the rest up front avoids dividing by zero and casting NaN/inf
    # to integers, whose result is undefined.
    usable = np.isfinite(points).all(axis=1) & (points[:, 2] > 0)
    points = points[usable]
    if points.shape[0] == 0:
        return np.zeros((out_h, out_w), dtype=np.float32)

    d = points[:, 2]
    normalized_points = points / d[:, np.newaxis]
    proj_pcd = np.round((undist_intrinsics @ normalized_points.T).T).astype(np.int64)

    # Stay inside both the requested image size and the allocated buffer so a
    # mismatch between the two cannot index out of bounds.
    proj_mask = (
        (proj_pcd[:, 0] >= 0)
        & (proj_pcd[:, 0] < min(w, out_w))
        & (proj_pcd[:, 1] >= 0)
        & (proj_pcd[:, 1] < min(h, out_h))
    )
    proj_pcd = proj_pcd[proj_mask]
    d = d[proj_mask].astype(np.float32)

    # Z-buffer: when several points fall on one pixel keep the nearest one.
    # Plain fancy-index assignment gives no guarantee about which duplicate
    # survives, so an unbuffered minimum is used instead.
    pcd_image = np.full((out_h, out_w), np.inf, dtype=np.float32)
    np.minimum.at(pcd_image, (proj_pcd[:, 1], proj_pcd[:, 0]), d)
    pcd_image[np.isinf(pcd_image)] = 0
    return pcd_image
68
+
69
+
70
def smooth_depth(depth):
    """Fill holes in a sparse depth image with an 11x11 minimum filter.

    Zero pixels are treated as "no measurement": they are temporarily raised
    to a large sentinel so they never win the minimum, and any pixel whose
    whole window contained no measurement is reset to zero afterwards.

    depth: 2D depth array; it is not modified.
    returns: new array of the same shape where each pixel holds the smallest
        non-zero depth found in its 11x11 neighbourhood, or 0 if there is none.
    """
    hole_marker = 1e5
    window = 11

    filled = depth.copy()
    holes = filled == 0
    filled[holes] = hole_marker

    result = scipy.ndimage.minimum_filter(filled, size=window)
    still_empty = result == hole_marker
    result[still_empty] = 0
    return result
78
+
79
+
80
def align_rgb_depth(rgb, depth, roi, config, rgb_cnf, config_dict, T):
    """Undistort an RGB frame and re-render a depth frame from its viewpoint.

    rgb: colour image to undistort.
    depth: depth image that is back-projected to a point cloud.
    roi: (x, y, w, h) crop applied to both outputs.
    config: mapping with "res_h"/"res_w", forwarded to project_pcd_to_depth.
    rgb_cnf: mapping with "intrinsics", "dist_coeff" and "undist_intrinsics"
        of the RGB camera.
    config_dict: mapping whose "depth" entry provides "dist_mtx" and
        "dist_coef" for the depth camera.
    T: transform applied to the depth point cloud before projection
        (presumably the depth-to-RGB extrinsics -- confirm against caller).
    returns: (cropped undistorted rgb, cropped aligned depth,
        rgb_cnf["undist_intrinsics"]).
    """
    # Remove lens distortion from the colour image
    rgb_undistorted = cv.undistort(
        rgb,
        rgb_cnf["intrinsics"],
        rgb_cnf["dist_coeff"],
        None,
        rgb_cnf["undist_intrinsics"],
    )

    # Lift the depth image to 3D and move it with T
    cloud = o3d.geometry.PointCloud()
    depth_cnf = config_dict["depth"]
    cloud.points = o3d.utility.Vector3dVector(
        pointcloudify_depth(depth, depth_cnf["dist_mtx"], depth_cnf["dist_coef"])
    )
    cloud.transform(T)

    # Re-project the moved cloud with the RGB camera and densify the result
    sparse_depth = project_pcd_to_depth(
        cloud, rgb_cnf["undist_intrinsics"], rgb.shape[:2], config
    )
    dense_depth = smooth_depth(sparse_depth)

    x, y, w, h = roi
    crop = (slice(y, y + h), slice(x, x + w))
    return rgb_undistorted[crop], dense_depth[crop], rgb_cnf["undist_intrinsics"]
110
+
111
+
112
def process_pair(args):
    """Align one (rgb, depth) timestamp pair and write the results to disk.

    args: tuple of (pair, smartphone_folder, azure_depth_folder, final_folder,
        config, rgb_cnf, config_dict, T) where ``pair`` is
        (rgb_timestamp, depth_timestamp). Packed into one tuple so the
        function can be submitted to an executor with a single argument.
    returns: None on success, otherwise a string describing the failure
        (errors are reported instead of raised so one bad pair does not abort
        the whole batch).

    Outputs are written under ``final_folder`` as rgb/<rgb_ts>.png,
    depth/<rgb_ts>.npy and cam/<rgb_ts>.npz, all named after the RGB timestamp.
    """
    (
        pair,
        smartphone_folder,
        azure_depth_folder,
        final_folder,
        config,
        rgb_cnf,
        config_dict,
        T,
    ) = args
    try:
        rgb_path = os.path.join(smartphone_folder, f"{pair[0]}.png")
        rgb_image = cv.imread(rgb_path)
        if rgb_image is None:
            # cv.imread reports a missing/unreadable file by returning None
            raise IOError(f"could not read RGB image {rgb_path}")

        # NOTE(review): allow_pickle=True can execute code embedded in the
        # file; only run this on trusted data.
        depth_array = np.load(
            os.path.join(azure_depth_folder, f"{pair[1]}.npy"), allow_pickle=True
        )

        rgb_image_aligned, depth_array_aligned, intrinsics = align_rgb_depth(
            rgb_image,
            depth_array,
            (0, 0, config["res_w"], config["res_h"]),
            config,
            rgb_cnf,
            config_dict,
            T,
        )

        # Save rgb as 8-bit png; cv.imwrite returns False instead of raising
        rgb_out = os.path.join(final_folder, "rgb", f"{pair[0]}.png")
        if not cv.imwrite(rgb_out, rgb_image_aligned):
            raise IOError(f"could not write RGB image {rgb_out}")

        # Depth is stored as a raw array rather than a scaled 16-bit png
        np.save(
            os.path.join(final_folder, "depth", f"{pair[0]}.npy"), depth_array_aligned
        )
        np.savez(
            os.path.join(final_folder, "cam", f"{pair[0]}.npz"), intrinsics=intrinsics
        )
    except Exception as e:
        return f"Error processing pair {pair}: {e}"
    return None
156
+
157
+
158
def _list_timestamps(folder, extension):
    """Return the sorted integer timestamps encoded in ``folder``'s file names."""
    stamps = np.array(
        [
            int(name.split(".")[0])
            for name in os.listdir(folder)
            if name.endswith(extension)
        ]
    )
    stamps.sort()
    return stamps


def _match_depth_to_rgb(depth_ts, rgb_ts):
    """Pair RGB frames with depth frames by nearest timestamp, one depth per RGB frame."""
    closest = {}
    for depth_t in depth_ts:
        idx = np.argmin(np.abs(rgb_ts - depth_t))
        rgb_t = int(rgb_ts[idx])
        gap = abs(rgb_t - int(depth_t))
        # Outputs are named after the RGB timestamp, so several depth frames
        # mapped to one RGB frame would overwrite each other; keep the closest.
        if rgb_t not in closest or gap < closest[rgb_t][0]:
            closest[rgb_t] = (gap, int(depth_t))
    return [(rgb_t, depth_t) for rgb_t, (_, depth_t) in closest.items()]


def main():
    """Convert every SmartPortraits sequence into aligned rgb/depth/cam folders.

    Reads config.json and the calibration files it names from the directory of
    this script, walks DATA_DIR/<scene>/<s>/<date>, matches depth frames to
    RGB frames by nearest timestamp and processes the pairs in parallel.
    Sequences without a depth or RGB folder, or without frames, are skipped.
    """
    DATA_DIR_ = "data_smartportraits/SmartPortraits"  # REPLACE WITH YOUR OWN DATA PATH!
    DATA_DIR = DATA_DIR_.rstrip("/")
    print(f"{DATA_DIR_} {DATA_DIR}/")

    curr_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(curr_dir, "config.json")) as conf_f:
        config = json.load(conf_f)

    # Pre-load shared data
    # NOTE(review): pickle.load / allow_pickle=True can execute code stored in
    # the file; only use calibration files from a trusted source.
    with open(os.path.join(curr_dir, config["depth_conf"]), "rb") as config_f:
        config_dict = pickle.load(config_f)

    rgb_cnf = np.load(
        os.path.join(curr_dir, config["rgb_intristics"]), allow_pickle=True
    ).item()

    T = np.load(os.path.join(curr_dir, config["transform_intristics"]))

    # Folder where the converted data will be put
    final_root = "processed_smartportraits1"  # REPLACE WITH YOUR OWN DATA PATH!

    seqs = []
    for scene in os.listdir(DATA_DIR):
        scene_path = os.path.join(DATA_DIR, scene)
        if not os.path.isdir(scene_path):
            continue
        for s in os.listdir(scene_path):
            s_path = os.path.join(scene_path, s)
            if not os.path.isdir(s_path):
                continue
            for date in os.listdir(s_path):
                date_path = os.path.join(s_path, date)
                if os.path.isdir(date_path):
                    seqs.append((scene, s, date))

    for seq in tqdm(seqs):
        scene, s, date = seq
        dataset_path = os.path.join(DATA_DIR, scene, s, date)
        final_folder = os.path.join(final_root, "_".join([scene, s, date]))

        azure_depth_folder = os.path.join(dataset_path, "_azure_depth_image_raw")
        smartphone_folder = os.path.join(dataset_path, "smartphone_video_frames")

        if not (os.path.isdir(azure_depth_folder) and os.path.isdir(smartphone_folder)):
            print(f"Skipping {dataset_path}: missing depth or RGB folder")
            continue

        depth_ts = _list_timestamps(azure_depth_folder, ".npy")
        rgb_ts = _list_timestamps(smartphone_folder, ".png")

        if len(depth_ts) == 0 or len(rgb_ts) == 0:
            print(f"Skipping {dataset_path}: no depth or RGB frames found")
            continue

        print(
            f"Depth timestamps from {depth_ts[0]} to {depth_ts[-1]} (cnt {len(depth_ts)})"
        )
        print(f"RGB timestamps from {rgb_ts[0]} to {rgb_ts[-1]} (cnt {len(rgb_ts)})")

        # Build correspondences between depth and rgb by nearest neighbour algorithm
        rgbd_pairs = _match_depth_to_rgb(depth_ts, rgb_ts)

        # Prepare folder infrastructure (any previous output is discarded)
        if os.path.exists(final_folder):
            shutil.rmtree(final_folder)
        os.makedirs(os.path.join(final_folder, "depth"), exist_ok=True)
        os.makedirs(os.path.join(final_folder, "rgb"), exist_ok=True)
        os.makedirs(os.path.join(final_folder, "cam"), exist_ok=True)

        # Prepare arguments for processing
        tasks = [
            (
                pair,
                smartphone_folder,
                azure_depth_folder,
                final_folder,
                config,
                rgb_cnf,
                config_dict,
                T,
            )
            for pair in rgbd_pairs
        ]

        num_workers = os.cpu_count()
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = {executor.submit(process_pair, task): task[0] for task in tasks}
            for future in tqdm(
                as_completed(futures),
                total=len(futures),
                desc=f"Processing {scene}_{s}_{date}",
            ):
                error = future.result()
                if error:
                    print(error)
259
+
260
+
261
+ if __name__ == "__main__":
262
+ main()
extern/CUT3R/datasets_preprocess/flow_IO.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import struct
2
+ import numpy as np
3
+ import png
4
+ import re
5
+ import sys
6
+ import csv
7
+ from PIL import Image
8
+ import h5py
9
+
10
+
11
+ FLO_TAG_FLOAT = (
12
+ 202021.25 # first 4 bytes in flo file; check for this when READING the file
13
+ )
14
+ FLO_TAG_STRING = "PIEH" # first 4 bytes in flo file; use this when WRITING the file
15
+ FLO_UNKNOWN_FLOW_THRESH = 1e9 # flo format threshold for unknown values
16
+ FLO_UNKNOWN_FLOW = 1e10 # value to use to represent unknown flow in flo file format
17
+
18
+
19
+ def readFlowFile(filepath):
20
+ """read flow files in several formats. The resulting flow has shape height x width x 2.
21
+ For positions where there is no groundtruth available, the flow is set to np.nan.
22
+ Supports flo (Sintel), png (KITTI), npy (numpy), pfm (FlyingThings3D) and flo5 (Spring) file format.
23
+ filepath: path to the flow file
24
+ returns: flow with shape height x width x 2
25
+ """
26
+ if filepath.endswith(".flo"):
27
+ return readFloFlow(filepath)
28
+ elif filepath.endswith(".png"):
29
+ return readPngFlow(filepath)
30
+ elif filepath.endswith(".npy"):
31
+ return readNpyFlow(filepath)
32
+ elif filepath.endswith(".pfm"):
33
+ return readPfmFlow(filepath)
34
+ elif filepath.endswith(".flo5"):
35
+ return readFlo5Flow(filepath)
36
+ else:
37
+ raise ValueError(f"readFlowFile: Unknown file format for {filepath}")
38
+
39
+
40
def writeFlowFile(flow, filepath):
    """Write optical flow to a file, choosing the writer from the extension.

    Supports flo (Sintel), png (KITTI), npy (numpy) and flo5 file formats.
    flow: optical flow with shape height x width x 2. Invalid values should be
        represented as np.nan
    filepath: file path where to write the flow
    raises: ValueError for an empty path or unknown extension, IOError when
        ``flow`` does not have shape height x width x 2
    """
    if not filepath:
        raise ValueError("writeFlowFile: empty filepath")

    dims = flow.shape
    has_flow_layout = len(dims) == 3 and dims[2] == 2
    if not has_flow_layout:
        raise IOError(
            f"writeFlowFile {filepath}: expected shape height x width x 2 but received {flow.shape}"
        )

    # Taller-than-wide input often means the axes were swapped; warn only
    rows, cols = dims[0], dims[1]
    if rows > cols:
        print(
            f"write flo file {filepath}: Warning: Are you writing an upright image? Expected shape height x width x 2, got {flow.shape}"
        )

    if filepath.endswith(".flo"):
        return writeFloFlow(flow, filepath)
    if filepath.endswith(".png"):
        return writePngFlow(flow, filepath)
    if filepath.endswith(".npy"):
        return writeNpyFile(flow, filepath)
    if filepath.endswith(".flo5"):
        return writeFlo5File(flow, filepath)

    raise ValueError(f"writeFlowFile: Unknown file format for {filepath}")
68
+
69
+
70
def readFloFlow(filepath):
    """read optical flow from file stored in .flo file format as used in the Sintel dataset (Butler et al., 2012)
    filepath: path to file where to read from
    returns: flow as a float64 numpy array with shape height x width x 2; unknown values are np.nan
    raises: IOError if the path is missing, has the wrong extension, or the file
        has a wrong tag, illegal dimensions, or too little / too much data
    ---
    ".flo" file format used for optical flow evaluation

    Stores 2-band float image for horizontal (u) and vertical (v) flow components.
    Floats are stored in little-endian order.
    A flow value is considered "unknown" if either |u| or |v| is greater than 1e9.

    bytes  contents

    0-3     tag: "PIEH" in ASCII, which in little endian happens to be the float 202021.25
            (just a sanity check that floats are represented correctly)
    4-7     width as an integer
    8-11    height as an integer
    12-end  data (width*height*2*4 bytes total)
            the float values for u and v, interleaved, in row order, i.e.,
            u[row0,col0], v[row0,col0], u[row0,col1], v[row0,col1], ...
    """
    if filepath is None:
        raise IOError("read flo file: empty filename")

    if not filepath.endswith(".flo"):
        raise IOError(f"read flo file ({filepath}): extension .flo expected")

    with open(filepath, "rb") as stream:
        header = stream.read(12)
        if len(header) != 12:
            raise IOError(f"read flo file({filepath}): file is too short")

        # "<" pins little-endian as required by the format, independent of
        # the byte order of the machine running this code
        tag, width, height = struct.unpack("<fii", header)

        if tag != FLO_TAG_FLOAT:  # simple test for a valid .flo header
            raise IOError(
                f"read flo file({filepath}): wrong tag (possibly due to big-endian machine?)"
            )

        # another sanity check to see that integers were read correctly (99999 should do the trick...)
        if width < 1 or width > 99999:
            raise IOError(f"read flo file({filepath}): illegal width {width}")

        if height < 1 or height > 99999:
            raise IOError(f"read flo file({filepath}): illegal height {height}")

        nBands = 2
        expected_bytes = nBands * width * height * 4

        # Read what is actually in the file (never a header-derived size) so a
        # corrupt header cannot trigger a huge allocation
        payload = stream.read()

    if len(payload) < expected_bytes:
        raise IOError(f"read flo file({filepath}): file is too short")
    if len(payload) > expected_bytes:
        raise IOError(f"read flo file({filepath}): file is too long")

    # Decode all values at once; astype yields a writable float64 copy
    flow = (
        np.frombuffer(payload, dtype="<f4")
        .astype(np.float64)
        .reshape((height, width, nBands))
    )
    # unknown values are set to nan
    flow[np.abs(flow) > FLO_UNKNOWN_FLOW_THRESH] = np.nan

    return flow
134
+
135
+
136
def writeFloFlow(flow, filepath):
    """
    write optical flow in .flo format to file as used in the Sintel dataset (Butler et al., 2012)
    flow: optical flow with shape height x width x 2; np.nan entries are written as the
        format's "unknown" value. The input array is not modified.
    filepath: optical flow file path to be saved
    raises: IOError if fewer bytes than expected could be written
    ---
    ".flo" file format used for optical flow evaluation

    Stores 2-band float image for horizontal (u) and vertical (v) flow components.
    Floats are stored in little-endian order.
    A flow value is considered "unknown" if either |u| or |v| is greater than 1e9.

    bytes  contents

    0-3     tag: "PIEH" in ASCII, which in little endian happens to be the float 202021.25
            (just a sanity check that floats are represented correctly)
    4-7     width as an integer
    8-11    height as an integer
    12-end  data (width*height*2*4 bytes total)
            the float values for u and v, interleaved, in row order, i.e.,
            u[row0,col0], v[row0,col0], u[row0,col1], v[row0,col1], ...
    """

    height, width, nBands = flow.shape

    # Substitute unknowns in a fresh array and fix the on-disk layout:
    # little-endian float32, row-major with u and v interleaved
    data = np.where(np.isnan(flow), FLO_UNKNOWN_FLOW, flow).astype("<f4")
    payload = data.tobytes()

    with open(filepath, "wb") as f:
        # write header ("<" keeps the integers little-endian on any machine)
        header = FLO_TAG_STRING.encode("ascii") + struct.pack("<ii", width, height)
        if f.write(header) != 12:
            raise IOError(f"write flo file {filepath}: problem writing header")

        # write content in a single call instead of packing row by row
        if f.write(payload) != len(payload):
            raise IOError(f"write flo file {filepath}: problem writing data")
180
+
181
+
182
def readPngFlow(filepath):
    """read optical flow from file stored in png file format as used in the KITTI 12 (Geiger et al., 2012) and KITTI 15 (Menze et al., 2015) dataset.
    filepath: path to file where to read from
    returns: flow as a numpy array with shape height x width x 2. Invalid values are represented as np.nan
    """
    # adapted from https://github.com/liruoteng/OpticalFlowToolkit
    direct = png.Reader(filename=filepath).asDirect()
    rows = list(direct[2])
    width, height = direct[3]["size"]

    # Each row holds three interleaved channels: u, v and a validity flag.
    decoded = np.zeros((height, width, 3), dtype=np.float64)
    for row_idx, row in enumerate(rows):
        for channel in range(3):
            decoded[row_idx, :, channel] = row[channel::3]

    # Pixels whose third channel is 0 are marked invalid.
    invalid = decoded[:, :, 2] == 0
    decoded[:, :, 0:2] = (decoded[:, :, 0:2] - 2**15) / 64.0
    decoded[invalid, 0] = np.nan
    decoded[invalid, 1] = np.nan
    return decoded[:, :, :2]
203
+
204
+
205
def writePngFlow(flow, filename):
    """write optical flow to file png file format as used in the KITTI 12 (Geiger et al., 2012) and KITTI 15 (Menze et al., 2015) dataset.
    flow: optical flow in shape height x width x 2, invalid values should be represented as np.nan
    filename: path to file where to write to
    """
    height, width = flow.shape[0], flow.shape[1]

    # Encode: value * 64 + 2**15; NaNs survive the scaling and mark invalid pixels.
    encoded = 64.0 * flow + 2**15
    is_invalid = np.isnan(encoded[:, :, 0]) | np.isnan(encoded[:, :, 1])
    validity = np.ones([height, width, 1])
    validity[is_invalid] = 0

    # Third channel carries the validity flag (1 = valid, 0 = invalid).
    encoded = np.concatenate([np.nan_to_num(encoded), validity], axis=-1)
    encoded = np.clip(encoded, 0, 2**16 - 1).astype(np.uint16)
    png_rows = np.reshape(encoded, (-1, width * 3))

    with open(filename, "wb") as f:
        writer = png.Writer(width=width, height=height, bitdepth=16, greyscale=False)
        writer.write(f, png_rows)
223
+
224
+
225
def readNpyFlow(filepath):
    """Load a numpy array from a file via np.load.
    filepath: file to read from
    returns: the loaded numpy array
    """
    loaded = np.load(filepath)
    return loaded
231
+
232
+
233
def writeNpyFile(arr, filepath):
    """Store a numpy array in a file via np.save.
    arr: numpy array to write
    filepath: file to write to
    """
    np.save(file=filepath, arr=arr)
239
+
240
+
241
def writeFlo5File(flow, filename):
    """Write flow to an HDF5 file as a gzip-compressed (level 5) dataset named "flow".
    flow: array to store
    filename: path of the file to create
    """
    with h5py.File(filename, "w") as h5_file:
        h5_file.create_dataset(
            "flow",
            data=flow,
            compression="gzip",
            compression_opts=5,
        )
244
+
245
+
246
def readFlo5Flow(filename):
    """Read the "flow" dataset from an HDF5 (flo5) file.
    filename: path of the file to read
    returns: the full content of the "flow" dataset
    raises: IOError if the file has no "flow" dataset
    """
    with h5py.File(filename, "r") as h5_file:
        if "flow" in h5_file.keys():
            return h5_file["flow"][()]
        raise IOError(
            f"File {filename} does not have a 'flow' key. Is this a valid flo5 file?"
        )
253
+
254
+
255
def readPfmFlow(filepath):
    """read optical flow from file stored in pfm file format as used in the FlyingThings3D (Mayer et al., 2016) dataset.
    filepath: path to file where to read from
    returns: flow as a numpy array with shape height x width x 2.
    raises: IOError if the PFM content is not a 3-dimensional array with 3 channels
    """
    pfm = readPfmFile(filepath)
    has_three_channels = len(pfm.shape) == 3 and pfm.shape[2] == 3
    if not has_three_channels:
        raise IOError(
            f"read pfm flow: PFM file has wrong shape (assumed to be w x h x 3): {pfm.shape}"
        )
    # keep only u and v; the third channel is dropped
    return pfm[:, :, :2]
271
+
272
+
273
def readPfmFile(filepath):
    """
    Read a PFM image file into a numpy array.

    adapted from https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html

    filepath: path to the PFM file
    returns: float32 array, height x width x 3 for a "PF" header and height x width
             for a "Pf" header, flipped vertically (np.flipud) relative to file order
    raises: Exception if the first header line is neither "PF" nor "Pf" or if the
            dimension line is malformed
    """
    # The context manager guarantees the handle is closed, also on malformed input.
    with open(filepath, "rb") as file:
        header = file.readline().rstrip().decode("ascii")
        if header == "PF":
            color = True
        elif header == "Pf":
            color = False
        else:
            raise Exception("Not a PFM file.")

        dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
        if not dim_match:
            raise Exception("Malformed PFM header.")
        width, height = map(int, dim_match.groups())

        # The sign of the scale line encodes the byte order: negative = little-endian.
        scale = float(file.readline().decode("ascii").rstrip())
        endian = "<" if scale < 0 else ">"

        # Everything after the three header lines is raw float data.
        data = np.fromfile(file, endian + "f")

    shape = (height, width, 3) if color else (height, width)
    return np.flipud(np.reshape(data, shape))
312
+
313
+
314
def writePfmFile(image, filepath):
    """
    Write a float32 image to a PFM file.

    adapted from https://lmb.informatik.uni-freiburg.de/resources/datasets/SceneFlowDatasets.en.html

    image: float32 numpy array with shape H x W x 3 (written with a "PF" header) or
           H x W / H x W x 1 (written with a "Pf" header)
    filepath: path of the file to write
    raises: Exception if the dtype is not float32 or the shape is unsupported
    """
    # Validate before opening, so invalid input does not create or truncate the file.
    if image.dtype.name != "float32":
        raise Exception("Image dtype must be float32.")

    if len(image.shape) == 3 and image.shape[2] == 3:  # color image
        color = True
    elif len(image.shape) == 2 or (
        len(image.shape) == 3 and image.shape[2] == 1
    ):  # greyscale
        color = False
    else:
        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

    # Rows are written in vertically flipped order (readPfmFile flips them back).
    image = np.flipud(image)

    # A negative scale marks little-endian data.
    scale = 1
    endian = image.dtype.byteorder
    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
        scale = -scale

    with open(filepath, "wb") as file:
        # Bytes literals for both variants: the original encoded only the "Pf\n"
        # branch and so tried to write a str to a binary file for color images.
        file.write(b"PF\n" if color else b"Pf\n")
        file.write(b"%d %d\n" % (image.shape[1], image.shape[0]))
        file.write(b"%f\n" % scale)
        image.tofile(file)
348
+
349
+
350
def readDispFile(filepath):
    """read disparity (or disparity change) from file. The resulting numpy array has shape height x width.
    For positions where there is no groundtruth available, the value is set to np.nan.
    Supports png (KITTI), npy (numpy), pfm (FlyingThings3D) and dsp5 file format.
    filepath: path to the disparity file
    returns: disparity with shape height x width
    raises: ValueError if the file extension is not one of the supported formats
    """
    # Dispatch on the file extension; the first matching suffix wins.
    if filepath.endswith(".png"):
        return readPngDisp(filepath)
    if filepath.endswith(".npy"):
        return readNpyFlow(filepath)
    if filepath.endswith(".pfm"):
        return readPfmDisp(filepath)
    if filepath.endswith(".dsp5"):
        return readDsp5Disp(filepath)
    raise ValueError(f"readDispFile: Unknown file format for {filepath}")
367
+
368
+
369
def readPngDisp(filepath):
    """read disparity from file stored in png file format as used in the KITTI 12 (Geiger et al., 2012) and KITTI 15 (Menze et al., 2015) dataset.
    filepath: path to file where to read from
    returns: disparity as a numpy array with shape height x width. Invalid values are represented as np.nan
    raises: IOError if the png does not have exactly one channel
    """
    # adapted from https://github.com/liruoteng/OpticalFlowToolkit
    direct = png.Reader(filename=filepath).asDirect()
    rows = list(direct[2])
    width, height = direct[3]["size"]

    channels = len(rows[0]) // width
    if channels != 1:
        raise IOError("read png disp: assumed channels to be 1!")

    disparity = np.zeros((height, width), dtype=np.float64)
    for row_idx, row in enumerate(rows):
        disparity[row_idx, :] = row[:]

    # A stored value of 0 marks an invalid pixel; stored values are disparity * 256.
    disparity[disparity == 0] = np.nan
    return disparity[:, :] / 256.0
387
+
388
+
389
def readPfmDisp(filepath):
    """read disparity or disparity change from file stored in pfm file format as used in the FlyingThings3D (Mayer et al., 2016) dataset.
    filepath: path to file where to read from
    returns: disparity as a numpy array with shape height x width.
    raises: IOError if the PFM content is not 2-dimensional
    """
    disp = readPfmFile(filepath)
    if len(disp.shape) == 2:
        return disp
    raise IOError(
        f"read pfm disp: PFM file has wrong shape (assumed to be w x h): {disp.shape}"
    )
400
+
401
+
402
def writePngDisp(disp, filepath):
    """write disparity to png file format as used in the KITTI 12 (Geiger et al., 2012) and KITTI 15 (Menze et al., 2015) dataset.
    disp: disparity in shape height x width, invalid values should be represented as np.nan
    filepath: path to file where to write to
    """
    # Stored value = disparity * 256, clipped to the 16-bit range; NaN becomes 0.
    scaled = np.clip(256 * disp, 0, 2**16 - 1)
    height, width = scaled.shape[0], scaled.shape[1]
    pixels = np.nan_to_num(scaled).astype(np.uint16)
    pixels = np.reshape(pixels, (-1, width))
    with open(filepath, "wb") as f:
        writer = png.Writer(width=width, height=height, bitdepth=16, greyscale=True)
        writer.write(f, pixels)
416
+
417
+
418
def writeDsp5File(disp, filename):
    """Write disp to an HDF5 file as a gzip-compressed (level 5) dataset named "disparity".
    disp: array to store
    filename: path of the file to create
    """
    with h5py.File(filename, "w") as h5_file:
        h5_file.create_dataset(
            "disparity",
            data=disp,
            compression="gzip",
            compression_opts=5,
        )
421
+
422
+
423
def readDsp5Disp(filename):
    """Read the "disparity" dataset from an HDF5 (dsp5) file.
    filename: path of the file to read
    returns: the full content of the "disparity" dataset
    raises: IOError if the file has no "disparity" dataset
    """
    with h5py.File(filename, "r") as h5_file:
        if "disparity" in h5_file.keys():
            return h5_file["disparity"][()]
        raise IOError(
            f"File {filename} does not have a 'disparity' key. Is this a valid dsp5 file?"
        )
430
+
431
+
432
def writeDispFile(disp, filepath):
    """write disparity to file. Supports png (KITTI), npy (numpy) and dsp5 (HDF5) file format.
    disp: disparity with shape height x width. Invalid values should be represented as np.nan
    filepath: file path where to write the disparity
    raises: ValueError if filepath is empty or has an unsupported extension,
            IOError if disp is not 2-dimensional
    """
    if not filepath:
        raise ValueError("writeDispFile: empty filepath")

    if len(disp.shape) != 2:
        raise IOError(
            f"writeDispFile {filepath}: expected shape height x width but received {disp.shape}"
        )

    # Heuristic only: taller-than-wide data may mean the axes were swapped.
    if disp.shape[0] > disp.shape[1]:
        print(
            f"writeDispFile {filepath}: Warning: Are you writing an upright image? Expected shape height x width, got {disp.shape}"
        )

    if filepath.endswith(".png"):
        writePngDisp(disp, filepath)
    elif filepath.endswith(".npy"):
        writeNpyFile(disp, filepath)
    elif filepath.endswith(".dsp5"):
        writeDsp5File(disp, filepath)
    else:
        # Fail loudly, as readDispFile does, instead of silently writing nothing.
        raise ValueError(f"writeDispFile: Unknown file format for {filepath}")
456
+
457
+
458
def readKITTIObjMap(filepath):
    """Read a png object map and return a boolean array that is True where the pixel value is > 0.
    filepath: path to a .png file
    returns: boolean numpy array
    """
    assert filepath.endswith(".png")
    pixels = np.asarray(Image.open(filepath))
    return pixels > 0
461
+
462
+
463
def readKITTIIntrinsics(filepath, image=2):
    """Read camera intrinsics from a space-delimited KITTI calibration text file.
    filepath: path to a .txt calibration file
    image: camera index; the row whose first token is "K_<image, two digits>:" is used
    returns: numpy array [K[0, 0], K[1, 1], K[0, 2], K[1, 2]] taken from the 3x3 matrix
             of that row, or None if the file has no such row
    """
    assert filepath.endswith(".txt")

    wanted_key = f"K_{image:02d}:"
    with open(filepath) as f:
        reader = csv.reader(f, delimiter=" ")
        for row in reader:
            # csv.reader yields an empty list for blank lines; skip them instead of
            # failing with IndexError on row[0].
            if not row:
                continue
            if row[0] == wanted_key:
                K = np.array(row[1:], dtype=np.float32).reshape(3, 3)
                kvec = np.array([K[0, 0], K[1, 1], K[0, 2], K[1, 2]])
                return kvec
    # No matching row: make the "not found" result explicit.
    return None
473
+
474
+
475
def writePngMapFile(map_, filename):
    """Convert the array map_ to a PIL image and save it to filename."""
    image = Image.fromarray(map_)
    image.save(filename)
extern/CUT3R/datasets_preprocess/generate_set_arkitscenes.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess scenes by sorting images and generating image/video collections.
4
+
5
+ This script processes scenes in parallel using a thread pool, updating metadata
6
+ with sorted images, trajectories, intrinsics, and generating pair, image collection,
7
+ and video collection data. The processed metadata is saved to a new file in each scene directory.
8
+
9
+ Usage:
10
+ python generate_set_arkitscenes.py --root /path/to/data --splits Training Test --max_interval 5.0 --num_workers 8
11
+ """
12
+
13
+ import os
14
+ import os.path as osp
15
+ import argparse
16
+ import numpy as np
17
+ from concurrent.futures import ThreadPoolExecutor, as_completed
18
+ from tqdm import tqdm
19
+
20
+
21
def get_timestamp(img_name):
    """
    Extract the timestamp from an image filename.
    Assumes the timestamp is the last underscore-separated token in the name
    (before a 4-character file extension such as ".png").

    Args:
        img_name (str): The image filename.

    Returns:
        float: The extracted timestamp.
    """
    return float(img_name[:-4].split("_")[-1])


def process_scene(root, split, scene, max_interval):
    """
    Process a single scene by sorting its images, reordering trajectories and
    intrinsics, remapping pair indices, and generating image/video collections.
    The result is written to "new_scene_metadata.npz" in the scene directory.

    Args:
        root (str): Root directory of the dataset.
        split (str): The dataset split (e.g., 'Training', 'Test').
        scene (str): The scene identifier.
        max_interval (float): Maximum allowed time interval (in seconds) between images to consider them in the same video collection.
    """
    scene_dir = osp.join(root, split, scene)
    metadata_path = osp.join(scene_dir, "scene_metadata.npz")

    # Load the scene metadata
    with np.load(metadata_path) as data:
        images = data["images"]
        trajectories = data["trajectories"]
        intrinsics = data["intrinsics"]
        pairs = data["pairs"]

    # Sort images by filename (the sort key is the name itself).
    # NOTE(review): this equals timestamp order only if the names sort
    # chronologically - confirm against the dataset naming.
    imgs_with_indices = sorted(enumerate(images), key=lambda x: x[1])
    indices, images = zip(*imgs_with_indices)
    indices = np.array(indices)
    index2sorted = {index: i for i, index in enumerate(indices)}

    # Reorder trajectories and intrinsics based on the new image order
    trajectories = trajectories[indices]
    intrinsics = intrinsics[indices]

    # Update pair indices (each pair is (id1, id2, score))
    pairs = [(index2sorted[id1], index2sorted[id2], score) for id1, id2, score in pairs]

    # Form image_collection: mapping from an image id to a list of (other image id, score)
    image_collection = {}
    for id1, id2, score in pairs:
        image_collection.setdefault(id1, []).append((id2, score))

    # Form video_collection: for each image, collect subsequent images within the
    # max_interval time window. Members are appended one by one, so an image is
    # kept even when the window reaches the end of the list without a break
    # (slicing with the loop variable afterwards dropped the last image then).
    timestamps = [get_timestamp(name) for name in images]
    video_collection = {}
    for i, start_time in enumerate(timestamps):
        members = []
        for j in range(i + 1, len(timestamps)):
            if timestamps[j] - start_time > max_interval:
                break
            members.append(j)
        video_collection[i] = members

    # Save the new metadata
    output_path = osp.join(scene_dir, "new_scene_metadata.npz")
    np.savez(
        output_path,
        images=images,
        trajectories=trajectories,
        intrinsics=intrinsics,
        pairs=pairs,
        image_collection=image_collection,
        video_collection=video_collection,
    )
    print(f"Processed scene: {scene}")
95
+
96
+
97
def main(args):
    """
    Submit every scene of every requested split to a thread pool and wait for all of them.

    Args:
        args: Parsed arguments providing root, splits, max_interval and num_workers.
    """
    root, splits = args.root, args.splits
    max_interval, num_workers = args.max_interval, args.num_workers

    futures = []

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for split in splits:
            # Each split lists its scenes in "all_metadata.npz".
            with np.load(osp.join(root, split, "all_metadata.npz")) as data:
                scenes = data["scenes"]

            futures.extend(
                executor.submit(process_scene, root, split, scene, max_interval)
                for scene in scenes
            )

        progress = tqdm(
            as_completed(futures), total=len(futures), desc="Processing scenes"
        )
        for future in progress:
            # result() re-raises any exception raised inside process_scene.
            future.result()
127
+
128
+
129
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(
        description="Preprocess scene data to update metadata with sorted images and collections."
    )
    option_specs = [
        (
            "--root",
            {
                "type": str,
                "default": "",
                "help": "Root directory containing the dataset splits.",
            },
        ),
        (
            "--splits",
            {
                "type": str,
                "nargs": "+",
                "default": ["Training", "Test"],
                "help": "List of dataset splits to process (e.g., Training Test).",
            },
        ),
        (
            "--max_interval",
            {
                "type": float,
                "default": 5.0,
                "help": "Maximum time interval (in seconds) between images to consider them in the same video sequence.",
            },
        ),
        (
            "--num_workers",
            {
                "type": int,
                "default": 8,
                "help": "Number of worker threads for parallel processing.",
            },
        ),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()
    main(args)
extern/CUT3R/datasets_preprocess/generate_set_scannet.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess ScanNet scenes to generate video collections.
4
+
5
+ This script processes each scene in specified splits by reading the image filenames
6
+ from the "color" folder, grouping images into video sequences based on a maximum
7
+ timestamp interval, and then saving the per-scene metadata as a NumPy .npz file.
8
+
9
+ Usage:
10
+ python generate_set_scannet.py --root /path/to/processed_scannet \
11
+ --splits scans_test scans_train --max_interval 150 --num_workers 8
12
+ """
13
+
14
+ import os
15
+ import os.path as osp
16
+ import argparse
17
+ import numpy as np
18
+ from concurrent.futures import ThreadPoolExecutor, as_completed
19
+ from tqdm import tqdm
20
+
21
+
22
def get_timestamp(img_name):
    """
    Interpret an image basename as an integer timestamp.

    The basename is passed to int() unchanged, so it must be an integer string.

    Args:
        img_name (str): Image basename (without extension).

    Returns:
        int: The timestamp as an integer.
    """
    timestamp = int(img_name)
    return timestamp
35
+
36
+
37
def process_scene(root, split, scene, max_interval):
    """
    Build the video collection of one scene and save it next to the scene data.

    The ".jpg" files in the scene's "color" folder are ordered by timestamp; for
    each image, the following images whose timestamp is at most max_interval
    larger are recorded. The result is written to "new_scene_metadata.npz".

    Args:
        root (str): Root directory for the processed data.
        split (str): Name of the split (e.g., 'scans_test', 'scans_train').
        scene (str): Name of the scene directory.
        max_interval (int): Maximum allowed difference in timestamps for grouping images.
    """
    scene_dir = osp.join(root, split, scene)
    color_dir = osp.join(scene_dir, "color")

    # Basename = everything before the first dot of each .jpg file name.
    jpg_stems = [
        name.split(".")[0] for name in os.listdir(color_dir) if name.endswith(".jpg")
    ]
    basenames = sorted(jpg_stems, key=get_timestamp)
    stamps = [get_timestamp(stem) for stem in basenames]

    video_collection = {}
    for i, start in enumerate(stamps):
        window = []
        video_collection[i] = window
        for j in range(i + 1, len(stamps)):
            # Stop at the first image that lies outside the allowed interval.
            if stamps[j] - start > max_interval:
                break
            window.append(j)

    # Save the basenames and the video collection to an NPZ file.
    out_path = osp.join(scene_dir, "new_scene_metadata.npz")
    np.savez(out_path, images=basenames, video_collection=video_collection)
    print(f"Processed scene: {scene} (split: {split})")
72
+
73
+
74
def main(args):
    """
    Submit every scene directory of every existing split to a thread pool and wait for all of them.

    Args:
        args: Parsed arguments providing root, splits, max_interval and num_workers.
    """
    root, splits = args.root, args.splits
    max_interval, num_workers = args.max_interval, args.num_workers

    futures = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        for split in splits:
            split_dir = osp.join(root, split)
            if osp.isdir(split_dir):
                futures.extend(
                    executor.submit(process_scene, root, split, scene, max_interval)
                    for scene in os.listdir(split_dir)
                )
            else:
                print(
                    f"Warning: Split directory '{split_dir}' does not exist; skipping."
                )

        progress = tqdm(
            as_completed(futures), total=len(futures), desc="Processing scenes"
        )
        for future in progress:
            # result() re-raises any exception raised inside process_scene.
            future.result()
100
+
101
+
102
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(
        description="Preprocess ScanNet scenes to create video collections based on image timestamps."
    )
    option_specs = [
        (
            "--root",
            {
                "type": str,
                "default": "",
                "help": "Root directory containing the processed ScanNet splits.",
            },
        ),
        (
            "--splits",
            {
                "type": str,
                "nargs": "+",
                "default": ["scans_test", "scans_train"],
                "help": "List of split directories to process (e.g., scans_test scans_train).",
            },
        ),
        (
            "--max_interval",
            {
                "type": int,
                "default": 150,
                "help": "Maximum allowed timestamp difference (in integer units) for grouping images.",
            },
        ),
        (
            "--num_workers",
            {
                "type": int,
                "default": 8,
                "help": "Number of worker threads for parallel processing.",
            },
        ),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()
    main(args)
extern/CUT3R/datasets_preprocess/generate_set_scannetpp.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess processed_scannetpp scenes to update scene metadata.
4
+
5
+ This script reads each scene's "scene_metadata.npz", sorts images by timestamp,
6
+ updates trajectories, intrinsics, and pair indices, and builds two collections:
7
+ - image_collection: For each image, stores pairs (other image index, score)
8
+ - video_collection: For each image, groups subsequent images whose timestamps
9
+ differ by at most a given max_interval (and share the same
10
+ first character in the image name).
11
+
12
+ The new metadata is saved as "new_scene_metadata.npz" in each scene folder.
13
+
14
+ Usage:
15
+ python generate_set_scannetpp.py --root /path/to/processed_scannetpp \
16
+ --max_interval 150 --num_workers 8
17
+ """
18
+
19
+ import os
20
+ import os.path as osp
21
+ import argparse
22
+ import numpy as np
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ from tqdm import tqdm
25
+
26
+
27
def get_timestamp(img_name):
    """
    Derive an integer timestamp from an image name.

    Names starting with 'DSC' use the digits after that prefix; all other names
    use the second underscore-separated token.

    Args:
        img_name (str): The image basename (without extension).

    Returns:
        int: The extracted timestamp.
    """
    digits = img_name[3:] if img_name.startswith("DSC") else img_name.split("_")[1]
    return int(digits)
44
+
45
+
46
def process_scene(root, scene, max_interval):
    """
    Rewrite the metadata of one scene with sorted images and image/video collections.

    Images are sorted by name, trajectories and intrinsics are reordered to match,
    pair indices are remapped, and the result is saved as "new_scene_metadata.npz"
    in the scene folder.

    Args:
        root (str): Root directory containing scene folders.
        scene (str): Scene folder name.
        max_interval (int): Maximum allowed difference (in timestamp units) for video grouping.
    """
    scene_dir = osp.join(root, scene)
    images_dir = osp.join(scene_dir, "images")

    def _jpg_exists(name):
        # True if "<scene_dir>/images/<name>.jpg" is present on disk.
        return osp.exists(osp.join(images_dir, name + ".jpg"))

    with np.load(osp.join(scene_dir, "scene_metadata.npz"), allow_pickle=True) as data:
        images = data["images"]
        trajectories = data["trajectories"]
        intrinsics = data["intrinsics"]
        pairs = data["pairs"]

    # Sort by image name and remember where every original index ended up.
    ordered = sorted(enumerate(images), key=lambda item: item[1])
    indices, images = zip(*ordered)
    indices = np.array(indices)
    index2sorted = {old: new for new, old in enumerate(indices)}

    # Bring the per-image arrays into the new order.
    trajectories = trajectories[indices]
    intrinsics = intrinsics[indices]

    # Remap pairs (each pair is (id1, id2, score)) to the new indices.
    pairs = [(index2sorted[a], index2sorted[b], score) for a, b, score in pairs]

    # image_collection: keep only pairs whose two image files both exist.
    image_collection = {}
    for id1, id2, score in pairs:
        if _jpg_exists(images[id1]) and _jpg_exists(images[id2]):
            image_collection.setdefault(id1, []).append((id2, score))

    # video_collection: for every existing image, collect the following existing
    # images until one is more than max_interval later or its name starts with a
    # different character.
    video_collection = {}
    for i, image in enumerate(images):
        if not _jpg_exists(image):
            continue
        group = []
        video_collection[i] = group
        for j in range(i + 1, len(images)):
            candidate = images[j]
            if not _jpg_exists(candidate):
                continue
            too_late = get_timestamp(candidate) - get_timestamp(image) > max_interval
            if too_late or candidate[0] != image[0]:
                break
            group.append(j)

    # Save the updated metadata to a new file.
    out_path = osp.join(scene_dir, "new_scene_metadata.npz")
    np.savez(
        out_path,
        images=images,
        trajectories=trajectories,
        intrinsics=intrinsics,
        pairs=pairs,
        image_collection=image_collection,
        video_collection=video_collection,
    )
    print(f"Processed scene: {scene}")
122
+
123
+
124
def main(args):
    """
    Submit every scene listed in "all_metadata.npz" to a thread pool and wait for all of them.

    Args:
        args: Parsed arguments providing root, max_interval and num_workers.
    """
    root = args.root
    max_interval, num_workers = args.max_interval, args.num_workers

    # The scene list is stored under the "scenes" key of 'all_metadata.npz'.
    with np.load(osp.join(root, "all_metadata.npz"), allow_pickle=True) as data:
        scenes = data["scenes"]

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [
            executor.submit(process_scene, root, scene, max_interval)
            for scene in scenes
        ]
        progress = tqdm(
            as_completed(futures), total=len(futures), desc="Processing scenes"
        )
        for future in progress:
            # result() re-raises any exception raised inside process_scene.
            future.result()
144
+
145
+
146
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(
        description="Preprocess processed_scannetpp scenes to update scene metadata."
    )
    option_specs = [
        (
            "--root",
            {
                "type": str,
                "required": True,
                "help": "Root directory containing processed_scannetpp scene folders.",
            },
        ),
        (
            "--max_interval",
            {
                "type": int,
                "default": 150,
                "help": "Maximum timestamp interval for grouping images (default: 150).",
            },
        ),
        (
            "--num_workers",
            {
                "type": int,
                "default": 8,
                "help": "Number of worker threads for parallel processing (default: 8).",
            },
        ),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()
    main(args)
extern/CUT3R/datasets_preprocess/merge_dl3dv.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from tqdm import tqdm
4
+
5
+ # Set these paths to your original and moved locations.
6
+ src_base = "/path/to/processed_dl3dv" # original location
7
+ dst_base = "processed_dl3dv_ours" # current (moved) location
8
+
9
+ # Set dry_run to True for testing (no changes made), and False to perform the actions.
10
+ dry_run = False
11
+
12
def merge_directories(source_dir, destination_dir, dry_run=False):
    """
    Move everything inside source_dir into destination_dir.

    Conflict handling per entry:
      - File: an existing destination entry is removed, then the source file is moved.
      - Directory that exists at the destination: merged recursively; the source
        subdirectory is removed afterwards if it is empty.
      - Directory that does not exist at the destination: moved as a whole.

    With dry_run=True nothing is changed; the planned actions are printed instead.
    """
    for name in os.listdir(source_dir):
        src_path = os.path.join(source_dir, name)
        dst_path = os.path.join(destination_dir, name)

        if not os.path.isdir(src_path):
            # Plain file: the source version replaces whatever is at the destination.
            if os.path.exists(dst_path):
                if dry_run:
                    print(f"[Dry-run] Would remove existing file: {dst_path}")
                else:
                    os.remove(dst_path)
            if dry_run:
                print(f"[Dry-run] Would move file: {src_path} -> {dst_path}")
            else:
                shutil.move(src_path, dst_path)
            continue

        if not os.path.exists(dst_path):
            # Nothing to merge with: move the whole directory.
            if dry_run:
                print(f"[Dry-run] Would move directory: {src_path} -> {dst_path}")
            else:
                shutil.move(src_path, dst_path)
            continue

        # Directory exists on both sides: merge, then drop the emptied source.
        merge_directories(src_path, dst_path, dry_run=dry_run)
        if os.listdir(src_path):
            continue
        if dry_run:
            print(f"[Dry-run] Would remove empty directory: {src_path}")
        else:
            os.rmdir(src_path)
49
+
50
# Collect the relative "f1/f2" paths of all second-level entries in dst_base.
# This assumes the structure is: dst_base/f1/f2
all_folders = [
    os.path.join(f1, f2)
    for f1 in os.listdir(dst_base)
    if os.path.isdir(os.path.join(dst_base, f1))
    for f2 in os.listdir(os.path.join(dst_base, f1))
]

# Move or merge every collected folder back to its place under src_base.
for folder in tqdm(all_folders, desc="Moving folders back"):
    original_folder = os.path.join(src_base, folder)  # target location in the original path
    moved_folder = os.path.join(dst_base, folder)  # current location

    # Make sure the parent of the target exists (only reported in dry-run mode).
    parent_dir = os.path.dirname(original_folder)
    if not dry_run:
        os.makedirs(parent_dir, exist_ok=True)
    elif not os.path.exists(parent_dir):
        print(f"[Dry-run] Would create directory: {parent_dir}")

    if os.path.exists(original_folder):
        # Target already exists: merge the contents into it.
        merge_directories(moved_folder, original_folder, dry_run=dry_run)
        # Remove the moved folder if it becomes empty.
        if not os.listdir(moved_folder):
            if dry_run:
                print(f"[Dry-run] Would remove empty directory: {moved_folder}")
            else:
                os.rmdir(moved_folder)
    elif dry_run:
        print(f"[Dry-run] Would move folder: {moved_folder} -> {original_folder}")
    else:
        shutil.move(moved_folder, original_folder)
extern/CUT3R/datasets_preprocess/path_to_root.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # DUSt3R repo root import
6
+ # --------------------------------------------------------
7
+
8
+ import sys
9
+ import os.path as path
10
+
11
+ HERE_PATH = path.normpath(path.dirname(__file__))
12
+ DUST3R_REPO_PATH = path.normpath(path.join(HERE_PATH, "../"))
13
+ # workaround for sibling import
14
+ sys.path.insert(0, DUST3R_REPO_PATH)
extern/CUT3R/datasets_preprocess/preprocess_3dkb.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Process 3D Ken Burns data by selecting random view types, copying images and depth files,
4
+ and computing camera intrinsics from a field-of-view value. The output files are stored in an
5
+ organized folder structure.
6
+
7
+ Usage:
8
+ python preprocess_3dkb.py --root /path/to/data_3d_ken_burns \
9
+ --out_dir /path/to/processed_3dkb \
10
+ [--num_workers 4] [--seed 42]
11
+ """
12
+
13
+ import os
14
+ import json
15
+ import random
16
+ import shutil
17
+ from functools import partial
18
+ from pathlib import Path
19
+ import argparse
20
+
21
+ import cv2 # noqa: F401; cv2 is imported to ensure OpenEXR support.
22
+ import numpy as np
23
+ from PIL import Image
24
+ from tqdm import tqdm
25
+ from concurrent.futures import ProcessPoolExecutor, as_completed
26
+
27
+ # Ensure OpenCV can read OpenEXR files.
28
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
29
+
30
+
31
def fov_to_intrinsic_matrix(width, height, fov_deg, fov_type="horizontal"):
    """
    Build a 3x3 pinhole intrinsic matrix from an image size and a field of view.

    The focal length along the axis named by ``fov_type`` is
    ``size / (2 * tan(fov / 2))``; the focal length along the other axis is
    that value multiplied by the aspect ratio. The principal point is the
    image centre.

    NOTE(review): scaling the second focal length by the aspect ratio gives
    f_x != f_y for non-square images; confirm this is intended rather than
    f_x == f_y (square pixels). For square images both agree.

    Args:
        width (int): Image width in pixels.
        height (int): Image height in pixels.
        fov_deg (float): Field of view in degrees.
        fov_type (str): 'horizontal' or 'vertical'; the axis fov_deg refers to.

    Returns:
        np.ndarray: The 3x3 matrix [[f_x, 0, c_x], [0, f_y, c_y], [0, 0, 1]].

    Raises:
        ValueError: If width or height is non-positive, if fov_deg is not in
            (0, 180), or if fov_type is not a supported value.
    """
    if width <= 0 or height <= 0:
        raise ValueError("Image width and height must be positive numbers.")
    if not (0 < fov_deg < 180):
        raise ValueError("FOV must be between 0 and 180 degrees (non-inclusive).")
    if fov_type not in ("horizontal", "vertical"):
        raise ValueError("fov_type must be either 'horizontal' or 'vertical'.")

    # Shared denominator of the pinhole relation size = 2 * f * tan(fov / 2).
    two_tan_half_fov = 2 * np.tan(np.deg2rad(fov_deg) / 2)

    if fov_type == "vertical":
        f_y = height / two_tan_half_fov
        f_x = f_y * (width / height)
    else:
        f_x = width / two_tan_half_fov
        f_y = f_x * (height / width)

    return np.array(
        [
            [f_x, 0, width / 2],
            [0, f_y, height / 2],
            [0, 0, 1],
        ]
    )
69
+
70
+
71
def process_basename(root, seq, basename, view_types, out_dir):
    """
    Export one sample: a randomly chosen view's image, its depth map, and intrinsics.

    The image is re-saved as PNG, the depth EXR is copied unchanged, and the
    intrinsic matrix is derived from the "fltFov" entry of the sample's
    "-meta.json" file together with the image size.

    Args:
        root (str): Root directory of the raw data.
        seq (str): Sequence directory name.
        basename (str): Common identifier shared by the sample's files.
        view_types (list): View types to choose from (e.g. ['bl', 'br', 'tl', 'tr']).
        out_dir (str): Output directory; the rgb/depth/cam sub-folders for
            ``seq`` are expected to exist already.

    Returns:
        str or None: An error message if any step failed, otherwise None.
    """
    # One view is drawn at random for this sample.
    chosen_view = random.choice(view_types)

    # Source locations: images and metadata live in <root>/<seq>, depth maps
    # in the sibling folder <root>/<seq>-depth.
    src_seq_dir = os.path.join(root, seq)
    img_path = os.path.join(src_seq_dir, f"{basename}-{chosen_view}-image.png")
    cam_path = os.path.join(src_seq_dir, f"{basename}-meta.json")
    depth_path = os.path.join(
        root, f"{seq}-depth", f"{basename}-{chosen_view}-depth.exr"
    )

    # Destination locations.
    dst_seq_dir = os.path.join(out_dir, seq)
    out_img_path = os.path.join(dst_seq_dir, "rgb", f"{basename}.png")
    out_depth_path = os.path.join(dst_seq_dir, "depth", f"{basename}.exr")
    out_cam_path = os.path.join(dst_seq_dir, "cam", f"{basename}.npz")

    try:
        # Re-save the image as PNG and remember its size for the intrinsics.
        with Image.open(img_path) as img:
            W, H = img.size
            img.save(out_img_path, format="PNG")

        # The field of view comes from the JSON metadata.
        with open(cam_path, "r") as f:
            fov = json.load(f)["fltFov"]
        K = fov_to_intrinsic_matrix(W, H, fov)

        shutil.copy(depth_path, out_depth_path)
        np.savez(out_cam_path, intrinsics=K)
    except Exception as e:
        # Failures are reported to the caller as a message rather than raised.
        return f"Error processing {seq}/{basename}: {e}"

    return None  # Success indicator
129
+
130
+
131
def main():
    """
    Command-line entry point: export every sample of every sequence under --root.

    Sequences are the sub-directories of --root whose name does not end in
    "-depth". Within a sequence, each "<basename>-meta.json" file defines one
    sample. Samples are processed in parallel by ``process_basename`` and any
    error message it returns is printed.
    """
    parser = argparse.ArgumentParser(
        description="Process raw 3D Ken Burns video data and generate processed images, depth maps, and camera intrinsics."
    )
    parser.add_argument(
        "--root", type=str, required=True, help="Root directory of the raw data."
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        required=True,
        help="Output directory for processed data.",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=None,
        help="Number of worker processes to use (default: half of available CPUs).",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility (default: 42).",
    )
    parser.add_argument(
        "--view_types",
        type=str,
        nargs="+",
        default=["bl", "br", "tl", "tr"],
        help="List of view types to choose from (default: bl br tl tr).",
    )
    args = parser.parse_args()

    # Set the random seed.
    random.seed(args.seed)

    root = args.root
    out_dir = args.out_dir
    view_types = args.view_types

    # Default to half of the CPUs, but never fewer than one worker: on a
    # single-CPU machine 1 // 2 == 0, which ProcessPoolExecutor rejects.
    if args.num_workers is not None:
        num_workers = args.num_workers
    else:
        num_workers = max(1, (os.cpu_count() or 4) // 2)

    # Collect all sequence directories from root. Sorted so that the order in
    # which views are drawn below does not depend on directory listing order.
    seq_dirs = sorted(
        d
        for d in os.listdir(root)
        if os.path.isdir(os.path.join(root, d)) and not d.endswith("-depth")
    )

    # Pre-create output directory structure.
    for seq in seq_dirs:
        for subfolder in ["rgb", "depth", "cam"]:
            (Path(out_dir) / seq / subfolder).mkdir(parents=True, exist_ok=True)

    # Prepare list of tasks. Only files that really end in "-meta.json" define
    # a sample; other JSON files would otherwise yield a mangled basename.
    meta_suffix = "-meta.json"
    tasks = []
    for seq in seq_dirs:
        seq_path = os.path.join(root, seq)
        basenames = sorted(
            f[: -len(meta_suffix)]
            for f in os.listdir(seq_path)
            if f.endswith(meta_suffix)
        )
        for basename in basenames:
            # Draw the view here, in the seeded parent process. A draw made
            # inside a worker depends on which worker happens to pick up the
            # task, so --seed would not make the output reproducible.
            tasks.append((seq, basename, random.choice(view_types)))

    # Process tasks in parallel using ProcessPoolExecutor. Each task receives a
    # single-element view list, so the worker's own random choice is fixed.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {
            executor.submit(
                process_basename, root, seq, basename, [view_type], out_dir
            ): (seq, basename)
            for seq, basename, view_type in tasks
        }
        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Processing"
        ):
            error = future.result()
            if error:
                print(error)


if __name__ == "__main__":
    main()
extern/CUT3R/datasets_preprocess/preprocess_arkitscenes.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import os.path as osp
4
+ import decimal
5
+ import argparse
6
+ import math
7
+ from bisect import bisect_left
8
+ from PIL import Image
9
+ import numpy as np
10
+ import quaternion
11
+ from scipy import interpolate
12
+ import cv2
13
+ from tqdm import tqdm
14
+
15
+
16
def get_parser():
    """Return the argument parser for the ARKitScenes preprocessing script.

    All three options are optional strings with repository-relative defaults:
    the raw dataset directory, the directory of precomputed pairs, and the
    output directory.
    """
    option_defaults = {
        "--arkitscenes_dir": "data/dust3r_data/data_arkitscenes/raw",
        "--precomputed_pairs": "data/dust3r_data/data_arkitscenes/arkitscenes_pairs",
        "--output_dir": "data/dust3r_data/processed_arkitscenes",
    }
    parser = argparse.ArgumentParser()
    for flag, default_value in option_defaults.items():
        parser.add_argument(flag, default=default_value)
    return parser
31
+
32
+
33
def value_to_decimal(value, decimal_places):
    """Round ``value`` to ``decimal_places`` digits and return it as a Decimal.

    The value is converted through ``float`` and ``str`` first. As a side
    effect, the rounding mode of the current decimal context is set to
    ROUND_HALF_UP.
    """
    context = decimal.getcontext()
    context.rounding = decimal.ROUND_HALF_UP  # define rounding method
    quantum = decimal.Decimal("1e-{}".format(decimal_places))
    as_decimal = decimal.Decimal(str(float(value)))
    return as_decimal.quantize(quantum)
38
+
39
+
40
def closest(value, sorted_list):
    """Return the element of ``sorted_list`` nearest to ``value``.

    ``sorted_list`` must be sorted in ascending order. When ``value`` is
    exactly halfway between two neighbours, the smaller one is returned.
    """
    insert_at = bisect_left(sorted_list, value)
    # value is at or below the first element / above the last element.
    if insert_at == 0:
        return sorted_list[0]
    if insert_at == len(sorted_list):
        return sorted_list[-1]

    lower, upper = sorted_list[insert_at - 1], sorted_list[insert_at]
    return upper if upper - value < value - lower else lower
53
+
54
+
55
def get_up_vectors(pose_device_to_world):
    """Apply ``pose_device_to_world`` to the homogeneous direction (0, -1, 0, 0).

    Returns the result as a 4x1 column vector.
    """
    device_up = np.array([0.0, -1.0, 0.0, 0.0]).reshape(4, 1)
    return np.matmul(pose_device_to_world, device_up)
57
+
58
+
59
def get_right_vectors(pose_device_to_world):
    """Apply ``pose_device_to_world`` to the homogeneous direction (1, 0, 0, 0).

    Returns the result as a 4x1 column vector.
    """
    device_right = np.array([1.0, 0.0, 0.0, 0.0]).reshape(4, 1)
    return np.matmul(pose_device_to_world, device_right)
61
+
62
+
63
def read_traj(traj_path):
    """Parse a trajectory file into timestamps and inverted poses.

    Each line must hold exactly seven whitespace-separated numbers: a
    timestamp, a three-component rotation vector (converted with
    ``cv2.Rodrigues``) and a three-component translation. The rotation and
    translation form a 4x4 matrix which is inverted before being returned.

    Returns:
        tuple: ``(timestamps, poses, quaternions, poses_p_to_w)`` where
        ``timestamps`` are floats rounded to 3 decimals, ``poses`` are the
        translations of the inverted matrices, ``quaternions`` their rotations
        as quaternions, and ``poses_p_to_w`` the full inverted 4x4 matrices.
    """
    timestamps = []
    positions = []
    rotations = []
    inverted_poses = []

    with open(traj_path) as f:
        for line in f:
            fields = line.split()
            assert len(fields) == 7

            # Timestamps are rounded to milliseconds (for spline interpolation).
            raw_stamp = float(fields[0])
            timestamps.append(float(value_to_decimal(raw_stamp, 3)))

            rotation_vector = np.asarray([float(tok) for tok in fields[1:4]])
            r_w_to_p, _ = cv2.Rodrigues(rotation_vector)
            t_w_to_p = np.asarray([float(tok) for tok in fields[4:7]])

            # Assemble the 4x4 matrix stored in the file, then invert it.
            pose_w_to_p = np.eye(4)
            pose_w_to_p[:3, :3] = r_w_to_p
            pose_w_to_p[:3, 3] = t_w_to_p
            pose_p_to_w = np.linalg.inv(pose_w_to_p)

            inverted_poses.append(pose_p_to_w)
            positions.append(pose_p_to_w[:3, 3])
            rotations.append(quaternion.from_rotation_matrix(pose_p_to_w[:3, :3]))

    return timestamps, positions, rotations, inverted_poses
98
+
99
+
100
def main(rootdir, pairsdir, outdir):
    """Preprocess ARKitScenes scenes listed in the precomputed-pairs directory.

    For each split ("Test", "Training") this:
      1. reads ``<pairsdir>/<split>/scene_list.json``;
      2. for every scene found under ``rootdir`` with a non-empty selection,
         interpolates a pose per selected frame, rotates images and depth maps
         according to the detected sky direction, and writes them together
         with a per-scene ``scene_metadata.npz``;
      3. writes ``<outdir>/<split>/scene_list.json`` with the scenes that have
         a metadata file;
      4. concatenates all per-scene metadata into ``all_metadata.npz``.

    Args:
        rootdir: Raw dataset directory containing "Test" and "Training".
        pairsdir: Directory with the scene lists and ``selected_pairs.npz`` files.
        outdir: Output directory.
    """
    os.makedirs(outdir, exist_ok=True)

    subdirs = ["Test", "Training"]
    for subdir in subdirs:
        # STEP 1: list all scenes
        outsubdir = osp.join(outdir, subdir)
        os.makedirs(outsubdir, exist_ok=True)
        listfile = osp.join(pairsdir, subdir, "scene_list.json")
        with open(listfile, "r") as f:
            scene_dirs = json.load(f)

        valid_scenes = []
        for scene_subdir in tqdm(scene_dirs):
            # A scene of this split may live under either raw folder.
            if not os.path.isdir(osp.join(rootdir, "Test", scene_subdir)):
                if not os.path.isdir(osp.join(rootdir, "Training", scene_subdir)):
                    continue
                else:
                    root_subdir = "Training"
            else:
                root_subdir = "Test"
            out_scene_subdir = osp.join(outsubdir, scene_subdir)
            os.makedirs(out_scene_subdir, exist_ok=True)

            scene_dir = osp.join(rootdir, root_subdir, scene_subdir)
            depth_dir = osp.join(scene_dir, "lowres_depth")
            rgb_dir = osp.join(scene_dir, "vga_wide")
            intrinsics_dir = osp.join(scene_dir, "vga_wide_intrinsics")
            traj_path = osp.join(scene_dir, "lowres_wide.traj")

            # STEP 2: read selected_pairs.npz
            selected_pairs_path = osp.join(
                pairsdir, subdir, scene_subdir, "selected_pairs.npz"
            )
            # Context manager so the archive's file handle is released per scene.
            with np.load(selected_pairs_path) as selected_npz:
                selection, pairs = selected_npz["selection"], selected_npz["pairs"]
                selected_sky_direction_scene = str(
                    selected_npz["sky_direction_scene"][0]
                )
            if len(selection) == 0 or len(pairs) == 0:
                # not a valid scene
                continue
            valid_scenes.append(scene_subdir)

            # STEP 3: parse the scene and export the list of valid (K, pose, rgb, depth) and convert images
            scene_metadata_path = osp.join(out_scene_subdir, "scene_metadata.npz")
            if osp.isfile(scene_metadata_path):
                # Already processed by a previous run.
                continue
            else:
                print(f"parsing {scene_subdir}")
                # loads traj
                timestamps, poses, quaternions, poses_cam_to_world = read_traj(
                    traj_path
                )

                poses = np.array(poses)
                quaternions = np.array(quaternions, dtype=np.quaternion)
                quaternions = quaternion.unflip_rotors(quaternions)
                timestamps = np.array(timestamps)

                # The frame timestamp is the second "_"-separated field of the name.
                selected_images = [
                    (basename, basename.split(".png")[0].split("_")[1])
                    for basename in selection
                ]
                timestamps_selected = [
                    float(frame_id) for _, frame_id in selected_images
                ]

                sky_direction_scene, trajectories, intrinsics, images = (
                    convert_scene_metadata(
                        scene_subdir,
                        intrinsics_dir,
                        timestamps,
                        quaternions,
                        poses,
                        poses_cam_to_world,
                        selected_images,
                        timestamps_selected,
                    )
                )
                assert selected_sky_direction_scene == sky_direction_scene

                os.makedirs(os.path.join(out_scene_subdir, "vga_wide"), exist_ok=True)
                os.makedirs(
                    os.path.join(out_scene_subdir, "lowres_depth"), exist_ok=True
                )
                assert isinstance(sky_direction_scene, str)

                # Skip the scene entirely if any selected image or depth map
                # is missing; no metadata file is written in that case.
                all_exist = True
                for basename in images:
                    vga_wide_path = osp.join(rgb_dir, basename)
                    depth_path = osp.join(depth_dir, basename)
                    if not osp.isfile(vga_wide_path) or not osp.isfile(depth_path):
                        all_exist = False
                        break
                if not all_exist:
                    continue

                for basename in images:
                    img_out = os.path.join(
                        out_scene_subdir, "vga_wide", basename.replace(".png", ".jpg")
                    )
                    depth_out = os.path.join(out_scene_subdir, "lowres_depth", basename)
                    if osp.isfile(img_out) and osp.isfile(depth_out):
                        continue

                    vga_wide_path = osp.join(rgb_dir, basename)
                    depth_path = osp.join(depth_dir, basename)

                    img = Image.open(vga_wide_path)
                    depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)

                    # rotate the image; the fallback covers Pillow versions
                    # without the Image.Transpose enum
                    if sky_direction_scene == "RIGHT":
                        try:
                            img = img.transpose(Image.Transpose.ROTATE_90)
                        except Exception:
                            img = img.transpose(Image.ROTATE_90)
                        depth = cv2.rotate(depth, cv2.ROTATE_90_COUNTERCLOCKWISE)
                    elif sky_direction_scene == "LEFT":
                        try:
                            img = img.transpose(Image.Transpose.ROTATE_270)
                        except Exception:
                            img = img.transpose(Image.ROTATE_270)
                        depth = cv2.rotate(depth, cv2.ROTATE_90_CLOCKWISE)
                    elif sky_direction_scene == "DOWN":
                        try:
                            img = img.transpose(Image.Transpose.ROTATE_180)
                        except Exception:
                            img = img.transpose(Image.ROTATE_180)
                        depth = cv2.rotate(depth, cv2.ROTATE_180)

                    W, H = img.size
                    if not osp.isfile(img_out):
                        img.save(img_out)

                    # Depth is resized to the (rotated) image resolution.
                    depth = cv2.resize(
                        depth, (W, H), interpolation=cv2.INTER_NEAREST_EXACT
                    )
                    if not osp.isfile(
                        depth_out
                    ):  # avoid destroying the base dataset when you mess up the paths
                        cv2.imwrite(depth_out, depth)

                # save at the end
                np.savez(
                    scene_metadata_path,
                    trajectories=trajectories,
                    intrinsics=intrinsics,
                    images=images,
                    pairs=pairs,
                )

        # STEP 4: keep only scenes whose metadata file exists. Built as a new
        # list: removing from valid_scenes while iterating over it skips the
        # element after each removal and leaves stale scenes behind.
        outlistfile = osp.join(outsubdir, "scene_list.json")
        valid_scenes = [
            scene_subdir
            for scene_subdir in valid_scenes
            if osp.isfile(osp.join(outsubdir, scene_subdir, "scene_metadata.npz"))
        ]
        with open(outlistfile, "w") as f:
            json.dump(valid_scenes, f)

        # STEP 5: concat all scene_metadata.npz into a single file
        scene_data = {}
        for scene_subdir in valid_scenes:
            scene_metadata_path = osp.join(
                outsubdir, scene_subdir, "scene_metadata.npz"
            )
            with np.load(scene_metadata_path) as data:
                trajectories = data["trajectories"]
                intrinsics = data["intrinsics"]
                images = data["images"]
                pairs = data["pairs"]
            scene_data[scene_subdir] = {
                "trajectories": trajectories,
                "intrinsics": intrinsics,
                "images": images,
                "pairs": pairs,
            }
        offset = 0
        counts = []
        scenes = []
        sceneids = []
        images = []
        intrinsics = []
        trajectories = []
        pairs = []
        for scene_idx, (scene_subdir, data) in enumerate(scene_data.items()):
            num_imgs = data["images"].shape[0]
            img_pairs = data["pairs"]

            scenes.append(scene_subdir)
            sceneids.extend([scene_idx] * num_imgs)

            images.append(data["images"])

            # Per-scene rows are (w, h, fx, fy, cx, cy); expand to 3x3 matrices.
            K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0)
            K[:, 0, 0] = [fx for _, _, fx, _, _, _ in data["intrinsics"]]
            K[:, 1, 1] = [fy for _, _, _, fy, _, _ in data["intrinsics"]]
            K[:, 0, 2] = [hw for _, _, _, _, hw, _ in data["intrinsics"]]
            K[:, 1, 2] = [hh for _, _, _, _, _, hh in data["intrinsics"]]

            intrinsics.append(K)
            trajectories.append(data["trajectories"])

            # offset pairs so image indices refer to the concatenated arrays
            img_pairs[:, 0:2] += offset
            pairs.append(img_pairs)
            counts.append(offset)

            offset += num_imgs

        images = np.concatenate(images, axis=0)
        intrinsics = np.concatenate(intrinsics, axis=0)
        trajectories = np.concatenate(trajectories, axis=0)
        pairs = np.concatenate(pairs, axis=0)
        np.savez(
            osp.join(outsubdir, "all_metadata.npz"),
            counts=counts,
            scenes=scenes,
            sceneids=sceneids,
            images=images,
            intrinsics=intrinsics,
            trajectories=trajectories,
            pairs=pairs,
        )
324
+
325
+
326
def convert_scene_metadata(
    scene_subdir,
    intrinsics_dir,
    timestamps,
    quaternions,
    poses,
    poses_cam_to_world,
    selected_images,
    timestamps_selected,
):
    """Compute a pose and an intrinsics row for every selected image.

    Positions are linearly interpolated and rotations interpolated with
    ``quaternion.squad`` at the selected timestamps. Each pose is then
    right-multiplied by the scene-wide ``rotated_to_cam`` transform returned by
    ``find_scene_orientation``. Intrinsics are read from ``.pincam`` files; for
    "LEFT"/"RIGHT" scenes the width/height, fx/fy and principal-point entries
    are swapped.

    Returns:
        tuple: ``(sky_direction_scene, trajectories, intrinsics, images)``.
    """
    # find scene orientation
    sky_direction_scene, rotated_to_cam = find_scene_orientation(poses_cam_to_world)

    # find/compute pose for selected timestamps
    # most images have a valid timestamp / exact pose associated
    timestamps_selected = np.array(timestamps_selected)
    position_interp = interpolate.interp1d(timestamps, poses, kind="linear", axis=0)
    interpolated_rotations = quaternion.squad(
        quaternions, timestamps, timestamps_selected
    )
    interpolated_positions = position_interp(timestamps_selected)

    swap_axes = sky_direction_scene in ("RIGHT", "LEFT")

    trajectories, intrinsics, images = [], [], []
    for idx, (basename, frame_id) in enumerate(selected_images):
        # Try the exact frame id first, then one millisecond before and after.
        intrinsic_fn = osp.join(intrinsics_dir, f"{scene_subdir}_{frame_id}.pincam")
        for delta in (-0.001, 0.001):
            if osp.exists(intrinsic_fn):
                break
            intrinsic_fn = osp.join(
                intrinsics_dir, f"{scene_subdir}_{float(frame_id) + delta:.3f}.pincam"
            )
        assert osp.exists(intrinsic_fn)
        w, h, fx, fy, hw, hh = np.loadtxt(intrinsic_fn)  # PINHOLE

        cam_to_world = np.eye(4)
        cam_to_world[:3, :3] = quaternion.as_rotation_matrix(
            interpolated_rotations[idx]
        )
        cam_to_world[:3, 3] = interpolated_positions[idx]

        images.append(basename)
        if swap_axes:
            intrinsics.append([h, w, fy, fx, hh, hw])  # swapped intrinsics
        else:
            intrinsics.append([w, h, fx, fy, hw, hh])
        # pose_cam_to_world @ rotated_to_cam = rotated(cam) to world
        trajectories.append(cam_to_world @ rotated_to_cam)

    return sky_direction_scene, trajectories, intrinsics, images
378
+
379
+
380
def find_scene_orientation(poses_cam_to_world):
    """Classify the scene's sky direction and return the matching image-plane rotation.

    The device "up" and "right" directions are averaged over all poses and
    compared with the world +Z axis. Whichever averaged direction is closer to
    perpendicular to +Z decides between the LEFT/RIGHT and UP/DOWN cases.

    Returns:
        tuple: ``(sky_direction_scene, rotated_to_cam)`` where the first item is
        one of "LEFT", "RIGHT", "DOWN", "UP" and the second is the inverse of
        the 4x4 rotation about the camera Z axis chosen for that direction
        (identity for "UP").
    """
    up_world = np.array([[0.0], [0.0], [1.0], [0.0]])
    pose_count = len(poses_cam_to_world)
    if pose_count > 0:
        up_vector = sum(get_up_vectors(p) for p in poses_cam_to_world) / pose_count
        right_vector = (
            sum(get_right_vectors(p) for p in poses_cam_to_world) / pose_count
        )
    else:
        # No poses: fall back to the untransformed device axes.
        up_vector = np.array([[0.0], [-1.0], [0.0], [0.0]])
        right_vector = np.array([[1.0], [0.0], [0.0], [0.0]])

    def _angle_to_world_up_deg(direction):
        # value between 0, 180
        cosine = np.clip(np.dot(np.transpose(up_world), direction), -1.0, 1.0)
        return np.arccos(cosine).item() * 180.0 / np.pi

    device_up_to_world_up_angle = _angle_to_world_up_deg(up_vector)
    device_right_to_world_up_angle = _angle_to_world_up_deg(right_vector)

    up_offset_from_90 = abs(device_up_to_world_up_angle - 90.0)
    right_offset_from_90 = abs(device_right_to_world_up_angle - 90.0)

    if up_offset_from_90 < right_offset_from_90:
        # up is close to 90
        assert up_offset_from_90 < 45.0
        if device_right_to_world_up_angle > 90.0:
            sky_direction_scene = "LEFT"
            z_rotation = [0.0, 0.0, math.pi / 2.0]
        else:
            # note that in metadata.csv RIGHT does not exist, but again it's not accurate...
            # well, turns out there are scenes oriented like this
            # for example Training/41124801
            sky_direction_scene = "RIGHT"
            z_rotation = [0.0, 0.0, -math.pi / 2.0]
        cam_to_rotated_q = quaternion.from_rotation_vector(z_rotation)
    else:
        # right is close to 90
        assert right_offset_from_90 < 45.0
        if device_up_to_world_up_angle > 90.0:
            sky_direction_scene = "DOWN"
            cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, math.pi])
        else:
            sky_direction_scene = "UP"
            cam_to_rotated_q = quaternion.quaternion(1, 0, 0, 0)

    cam_to_rotated = np.eye(4)
    cam_to_rotated[:3, :3] = quaternion.as_rotation_matrix(cam_to_rotated_q)
    return sky_direction_scene, np.linalg.inv(cam_to_rotated)
440
+
441
+
442
if __name__ == "__main__":
    # Parse the command line and forward the three directories to main().
    cli_args = get_parser().parse_args()
    main(cli_args.arkitscenes_dir, cli_args.precomputed_pairs, cli_args.output_dir)
extern/CUT3R/datasets_preprocess/preprocess_arkitscenes_highres.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import os.path as osp
4
+ import decimal
5
+ import argparse
6
+ import math
7
+ from bisect import bisect_left
8
+ from PIL import Image
9
+ import numpy as np
10
+ import quaternion
11
+ from scipy import interpolate
12
+ import cv2
13
+ from tqdm import tqdm
14
+ from multiprocessing import Pool
15
+
16
+
17
def get_parser():
    """Return the argument parser for the high-resolution ARKitScenes script.

    ``--arkitscenes_dir`` defaults to an empty string and ``--output_dir`` to a
    repository-relative path.
    """
    parser = argparse.ArgumentParser()
    for flag, default_value in (
        ("--arkitscenes_dir", ""),
        ("--output_dir", "data/dust3r_data/processed_arkitscenes_highres"),
    ):
        parser.add_argument(flag, default=default_value)
    return parser
28
+
29
+
30
def value_to_decimal(value, decimal_places):
    """Return ``value`` as a Decimal rounded to ``decimal_places`` digits.

    The value goes through ``float`` and ``str`` before conversion. Calling
    this also sets the current decimal context's rounding mode to
    ROUND_HALF_UP.
    """
    decimal.getcontext().rounding = decimal.ROUND_HALF_UP  # define rounding method
    text = str(float(value))
    resolution = decimal.Decimal("1e-{}".format(decimal_places))
    return decimal.Decimal(text).quantize(resolution)
35
+
36
+
37
def closest(value, sorted_list):
    """Return the entry of the ascending ``sorted_list`` nearest to ``value``.

    Ties between the two neighbouring entries resolve to the smaller one.
    """
    pos = bisect_left(sorted_list, value)
    if pos == 0:
        # value does not exceed the first entry
        return sorted_list[0]
    if pos == len(sorted_list):
        # value exceeds every entry
        return sorted_list[-1]

    before = sorted_list[pos - 1]
    after = sorted_list[pos]
    if after - value < value - before:
        return after
    return before
50
+
51
+
52
def get_up_vectors(pose_device_to_world):
    """Return ``pose_device_to_world`` applied to the 4x1 direction (0, -1, 0, 0)."""
    minus_y_direction = np.array([[0.0, -1.0, 0.0, 0.0]]).T
    return np.matmul(pose_device_to_world, minus_y_direction)
54
+
55
+
56
def get_right_vectors(pose_device_to_world):
    """Return ``pose_device_to_world`` applied to the 4x1 direction (1, 0, 0, 0)."""
    plus_x_direction = np.array([[1.0, 0.0, 0.0, 0.0]]).T
    return np.matmul(pose_device_to_world, plus_x_direction)
58
+
59
+
60
def read_traj(traj_path):
    """Read a trajectory file and return timestamps plus inverted poses.

    Every line must contain exactly seven whitespace-separated numbers:
    timestamp, rotation vector (3 values, converted with ``cv2.Rodrigues``)
    and translation (3 values). The resulting 4x4 matrix is inverted.

    Returns:
        tuple: ``(timestamps, poses, quaternions, poses_p_to_w)``: timestamps
        rounded to 3 decimals, translations of the inverted matrices, their
        rotations as quaternions, and the inverted 4x4 matrices themselves.
    """
    stamps, translations, rotors, matrices = [], [], [], []

    with open(traj_path) as f:
        for raw_line in f:
            parts = raw_line.split()
            assert len(parts) == 7

            # Rounded to milliseconds (for spline interpolation).
            stamp = float(parts[0])
            stamps.append(float(value_to_decimal(stamp, 3)))

            axis_angle = np.asarray([float(p) for p in parts[1:4]])
            rotation_w_to_p, _ = cv2.Rodrigues(axis_angle)
            translation_w_to_p = np.asarray([float(p) for p in parts[4:7]])

            # Build the matrix as stored in the file, then invert it.
            w_to_p = np.eye(4)
            w_to_p[:3, :3] = rotation_w_to_p
            w_to_p[:3, 3] = translation_w_to_p
            p_to_w = np.linalg.inv(w_to_p)

            matrices.append(p_to_w)
            translations.append(p_to_w[:3, 3])
            rotors.append(quaternion.from_rotation_matrix(p_to_w[:3, :3]))

    return stamps, translations, rotors, matrices
95
+
96
+
97
def main(rootdir, outdir):
    """Process every scene of the "Validation" and "Training" splits in parallel.

    Each scene directory under ``<rootdir>/<split>`` is handed to
    ``process_scene`` through a multiprocessing pool. The non-None results are
    written to ``<outdir>/<split>/scene_list.json``.

    Args:
        rootdir: Raw dataset directory containing the split folders.
        outdir: Output directory.
    """
    os.makedirs(outdir, exist_ok=True)
    subdirs = ["Validation", "Training"]
    for subdir in subdirs:
        outsubdir = osp.join(outdir, subdir)
        # The split folder is otherwise only created as a side effect of a
        # scene being written; create it here so the scene list can always be
        # saved, even when no scene produced output.
        os.makedirs(outsubdir, exist_ok=True)
        scene_dirs = sorted(
            d
            for d in os.listdir(osp.join(rootdir, subdir))
            if osp.isdir(osp.join(rootdir, subdir, d))
        )

        # One argument tuple per scene; imap keeps results in input order.
        scene_args = [
            (rootdir, outdir, subdir, scene_subdir) for scene_subdir in scene_dirs
        ]
        with Pool() as pool:
            results = list(
                tqdm(pool.imap(process_scene, scene_args), total=len(scene_dirs))
            )

        # Filter None results and other post-processing
        valid_scenes = [result for result in results if result is not None]
        outlistfile = osp.join(outsubdir, "scene_list.json")
        with open(outlistfile, "w") as f:
            json.dump(valid_scenes, f)
129
+
130
+
131
def process_scene(args):
    """Convert one scene and report whether it is usable.

    Args:
        args: Tuple ``(rootdir, outdir, subdir, scene_subdir)``.

    Returns:
        str or None: ``scene_subdir`` when the scene's ``scene_metadata.npz``
        already exists or was written by this call; None when a required
        input is missing or no image could be selected. The caller keeps only
        non-None results, so returning the scene name on success is what puts
        the scene into the scene list.
    """
    rootdir, outdir, subdir, scene_subdir = args
    # Unpack paths
    scene_dir = osp.join(rootdir, subdir, scene_subdir)
    outsubdir = osp.join(outdir, subdir)
    out_scene_subdir = osp.join(outsubdir, scene_subdir)

    depth_dir = osp.join(scene_dir, "highres_depth")
    rgb_dir = osp.join(scene_dir, "vga_wide")
    intrinsics_dir = osp.join(scene_dir, "vga_wide_intrinsics")
    traj_path = osp.join(scene_dir, "lowres_wide.traj")

    # Validation if necessary resources exist
    if not all(
        osp.exists(required)
        for required in (depth_dir, rgb_dir, intrinsics_dir, traj_path)
    ):
        return None

    depth_files = sorted(os.listdir(depth_dir))
    img_files = sorted(os.listdir(rgb_dir))

    # STEP 3: parse the scene and export the list of valid (K, pose, rgb, depth) and convert images
    scene_metadata_path = osp.join(out_scene_subdir, "scene_metadata.npz")
    if osp.isfile(scene_metadata_path):
        # Already converted by a previous run; still a valid scene.
        print(f"Skipping {scene_subdir}")
        return scene_subdir

    print(f"parsing {scene_subdir}")
    # loads traj
    timestamps, poses, quaternions, poses_cam_to_world = read_traj(traj_path)

    poses = np.array(poses)
    quaternions = np.array(quaternions, dtype=np.quaternion)
    quaternions = quaternion.unflip_rotors(quaternions)
    timestamps = np.array(timestamps)

    # The frame timestamp is the second "_"-separated field of the file name.
    all_depths = sorted(
        [
            (basename, basename.split(".png")[0].split("_")[1])
            for basename in depth_files
        ],
        key=lambda x: float(x[1]),
    )

    # Keep only depth frames inside the trajectory's time range, so that
    # their poses can be interpolated.
    selected_depths = []
    timestamps_selected = []
    timestamp_min = timestamps.min()
    timestamp_max = timestamps.max()
    for basename, frame_id in all_depths:
        frame_id = float(frame_id)
        if frame_id < timestamp_min or frame_id > timestamp_max:
            continue
        selected_depths.append((basename, frame_id))
        timestamps_selected.append(frame_id)

    sky_direction_scene, trajectories, intrinsics, images, depths = (
        convert_scene_metadata(
            scene_subdir,
            intrinsics_dir,
            timestamps,
            quaternions,
            poses,
            poses_cam_to_world,
            img_files,
            selected_depths,
            timestamps_selected,
        )
    )

    if len(images) == 0:
        print(f"Skipping {scene_subdir}")
        return None

    os.makedirs(out_scene_subdir, exist_ok=True)

    os.makedirs(os.path.join(out_scene_subdir, "vga_wide"), exist_ok=True)
    os.makedirs(os.path.join(out_scene_subdir, "highres_depth"), exist_ok=True)
    assert isinstance(sky_direction_scene, str)

    for image_path, depth_path in zip(images, depths):
        img_out = os.path.join(
            out_scene_subdir, "vga_wide", image_path.replace(".png", ".jpg")
        )
        depth_out = os.path.join(out_scene_subdir, "highres_depth", depth_path)
        if osp.isfile(img_out) and osp.isfile(depth_out):
            continue

        vga_wide_path = osp.join(rgb_dir, image_path)
        depth_path = osp.join(depth_dir, depth_path)

        if not osp.isfile(vga_wide_path) or not osp.isfile(depth_path):
            continue

        img = Image.open(vga_wide_path)
        depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)

        # rotate the image; the fallback covers Pillow versions without the
        # Image.Transpose enum
        if sky_direction_scene == "RIGHT":
            try:
                img = img.transpose(Image.Transpose.ROTATE_90)
            except Exception:
                img = img.transpose(Image.ROTATE_90)
            depth = cv2.rotate(depth, cv2.ROTATE_90_COUNTERCLOCKWISE)

        elif sky_direction_scene == "LEFT":
            try:
                img = img.transpose(Image.Transpose.ROTATE_270)
            except Exception:
                img = img.transpose(Image.ROTATE_270)
            depth = cv2.rotate(depth, cv2.ROTATE_90_CLOCKWISE)

        elif sky_direction_scene == "DOWN":
            try:
                img = img.transpose(Image.Transpose.ROTATE_180)
            except Exception:
                img = img.transpose(Image.ROTATE_180)
            depth = cv2.rotate(depth, cv2.ROTATE_180)

        W, H = img.size
        if not osp.isfile(img_out):
            img.save(img_out)

        # Depth is resized to the (rotated) image resolution.
        depth = cv2.resize(depth, (W, H), interpolation=cv2.INTER_NEAREST)
        if not osp.isfile(
            depth_out
        ):  # avoid destroying the base dataset when you mess up the paths
            cv2.imwrite(depth_out, depth)

    # save at the end
    np.savez(
        scene_metadata_path,
        trajectories=trajectories,
        intrinsics=intrinsics,
        images=images,
    )
    return scene_subdir
270
+
271
+
272
def convert_scene_metadata(
    scene_subdir,
    intrinsics_dir,
    timestamps,
    quaternions,
    poses,
    poses_cam_to_world,
    all_images,
    selected_depths,
    timestamps_selected,
):
    """Compute per-frame camera poses and intrinsics for the selected depth frames.

    Poses are interpolated at ``timestamps_selected`` (linear interpolation of
    ``poses``, ``quaternion.squad`` for ``quaternions``), then re-expressed for
    the upright-rotated camera returned by ``find_scene_orientation``.

    Args:
        scene_subdir: scene identifier used as the file-name prefix.
        intrinsics_dir: directory containing the ``*.pincam`` files.
        timestamps: timestamps of the known trajectory samples.
        quaternions: rotations of the trajectory samples (one per timestamp).
        poses: values interpolated linearly along axis 0 and written into the
            translation column (presumably camera positions -- confirm with caller).
        poses_cam_to_world: poses used only to estimate the scene orientation.
        all_images: collection of available image file names
            (assumed to be hashable strings -- TODO confirm with caller).
        selected_depths: iterable of ``(basename, frame_id)`` pairs.
        timestamps_selected: timestamps matching ``selected_depths`` one-to-one.

    Returns:
        ``(sky_direction_scene, trajectories, intrinsics, images, depths)``;
        the four lists only contain frames for which both an intrinsics file
        and an image name were found.
    """
    # find scene orientation
    sky_direction_scene, rotated_to_cam = find_scene_orientation(poses_cam_to_world)

    # find/compute pose for selected timestamps
    # most images have a valid timestamp / exact pose associated
    timestamps_selected = np.array(timestamps_selected)
    spline = interpolate.interp1d(timestamps, poses, kind="linear", axis=0)
    interpolated_rotations = quaternion.squad(
        quaternions, timestamps, timestamps_selected
    )
    interpolated_positions = spline(timestamps_selected)

    # Build the lookup once: membership is tested several times per frame.
    known_images = set(all_images)

    trajectories = []
    intrinsics = []
    images = []
    depths = []
    for i, (basename, frame_id) in enumerate(selected_depths):
        # Look for an intrinsics file at the exact timestamp first, then within
        # +/- 0.1 (in 0.001 steps) around it.
        intrinsic_fn = osp.join(intrinsics_dir, f"{scene_subdir}_{frame_id}.pincam")
        search_interval = int(0.1 / 0.001)
        for timestamp in range(-search_interval, search_interval + 1):
            if osp.exists(intrinsic_fn):
                break
            intrinsic_fn = osp.join(
                intrinsics_dir,
                f"{scene_subdir}_{float(frame_id) + timestamp * 0.001:.3f}.pincam",
            )
        if not osp.exists(intrinsic_fn):
            print(f"Skipping {intrinsic_fn}")
            continue

        # Same search for the image name, within +/- 0.001.
        image_path = "{}_{}.png".format(scene_subdir, frame_id)
        search_interval = int(0.001 / 0.001)
        for timestamp in range(-search_interval, search_interval + 1):
            if image_path in known_images:
                break
            # Format with three decimals, exactly like the intrinsics search
            # above: a raw float would render as e.g. "12.8" or
            # "12.801000000000002" and never match a "12.800"-style name.
            image_path = (
                f"{scene_subdir}_{float(frame_id) + timestamp * 0.001:.3f}.png"
            )
        if image_path not in known_images:
            print(f"Skipping {scene_subdir} {frame_id}")
            continue

        w, h, fx, fy, hw, hh = np.loadtxt(intrinsic_fn)  # PINHOLE

        pose = np.eye(4)
        pose[:3, :3] = quaternion.as_rotation_matrix(interpolated_rotations[i])
        pose[:3, 3] = interpolated_positions[i]

        # Both lists store the depth basename; image_path above only gates the frame.
        images.append(basename)
        depths.append(basename)
        if sky_direction_scene == "RIGHT" or sky_direction_scene == "LEFT":
            intrinsics.append([h, w, fy, fx, hh, hw])  # swapped intrinsics
        else:
            intrinsics.append([w, h, fx, fy, hw, hh])
        trajectories.append(
            pose @ rotated_to_cam
        )  # pose_cam_to_world @ rotated_to_cam = rotated(cam) to world

    return sky_direction_scene, trajectories, intrinsics, images, depths
342
+
343
+
344
def find_scene_orientation(poses_cam_to_world):
    """Classify which image side faces the sky and return the matching rotation.

    The mean "up" and "right" vectors of the poses (from ``get_up_vectors`` /
    ``get_right_vectors``) are compared against the world up axis
    ``[0, 0, 1, 0]``. With no poses, fixed default vectors are used.

    Returns:
        ``(sky_direction_scene, rotated_to_cam)`` where the first item is one
        of ``"UP"``, ``"DOWN"``, ``"LEFT"``, ``"RIGHT"`` and the second is the
        4x4 inverse of the camera-to-rotated transform.
    """
    world_up = np.array([[0.0], [0.0], [1.0], [0.0]])
    num_poses = len(poses_cam_to_world)
    if num_poses > 0:
        up_vector = sum(get_up_vectors(p) for p in poses_cam_to_world) / num_poses
        right_vector = (
            sum(get_right_vectors(p) for p in poses_cam_to_world) / num_poses
        )
    else:
        up_vector = np.array([[0.0], [-1.0], [0.0], [0.0]])
        right_vector = np.array([[1.0], [0.0], [0.0], [0.0]])

    def _angle_to_world_up(direction):
        # angle in degrees, value between 0 and 180
        cosine = np.clip(np.dot(np.transpose(world_up), direction), -1.0, 1.0)
        return np.arccos(cosine).item() * 180.0 / np.pi

    up_angle = _angle_to_world_up(up_vector)
    right_angle = _angle_to_world_up(right_vector)

    up_offset = abs(up_angle - 90.0)
    right_offset = abs(right_angle - 90.0)

    if up_offset < right_offset:
        # the device "up" axis is the one closest to horizontal
        assert up_offset < 45.0
        if right_angle > 90.0:
            sky_direction_scene = "LEFT"
            z_rotation = math.pi / 2.0
        else:
            # note that in metadata.csv RIGHT does not exist, but again it's not accurate...
            # well, turns out there are scenes oriented like this
            # for example Training/41124801
            sky_direction_scene = "RIGHT"
            z_rotation = -math.pi / 2.0
        cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, z_rotation])
    else:
        # the device "right" axis is the one closest to horizontal
        assert right_offset < 45.0
        if up_angle > 90.0:
            sky_direction_scene = "DOWN"
            cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, math.pi])
        else:
            sky_direction_scene = "UP"
            cam_to_rotated_q = quaternion.quaternion(1, 0, 0, 0)

    cam_to_rotated = np.eye(4)
    cam_to_rotated[:3, :3] = quaternion.as_rotation_matrix(cam_to_rotated_q)
    return sky_direction_scene, np.linalg.inv(cam_to_rotated)
404
+
405
+
406
# Script entry point: parse the CLI flags and run the conversion.
if __name__ == "__main__":
    args = get_parser().parse_args()
    main(args.arkitscenes_dir, args.output_dir)
extern/CUT3R/datasets_preprocess/preprocess_bedlam.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Process Bedlam scenes by computing camera intrinsics and extrinsics
4
+ from extracted data. The script reads per-scene CSV and image/depth files,
5
+ computes the necessary camera parameters, and saves the resulting camera
6
+ files (as .npz files) in an output directory.
7
+
8
+ Usage:
9
+ python preprocess_bedlam.py --root /path/to/extracted_data \
10
+ --outdir /path/to/processed_bedlam \
11
+ [--num_workers 4]
12
+ """
13
+
14
+ import os
15
+ import cv2
16
+ import numpy as np
17
+ import pandas as pd
18
+ from glob import glob
19
+ import shutil
20
+ import OpenEXR # Ensure OpenEXR is installed
21
+ from concurrent.futures import ProcessPoolExecutor, as_completed
22
+ from tqdm import tqdm
23
+ import argparse
24
+
25
# Ask OpenCV to enable its OpenEXR codec.
# NOTE(review): cv2 is already imported above this line -- confirm the flag is
# still honoured when it is set after the import.
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"

# Global constants
IMG_FORMAT = ".png"  # extension of the frames globbed by get_params
rotate_flag = False
# Sensor size handed to get_cam_int together with the focal length
# (presumably millimetres, like the focal length -- TODO confirm).
SENSOR_W, SENSOR_H = 36, 20.25
# Image size in pixels; half of it is used as the principal point.
IMG_W, IMG_H = 1280, 720
+
36
+ # -----------------------------------------------------------------------------
37
+ # Helper functions for camera parameter conversion
38
+ # -----------------------------------------------------------------------------
39
+
40
+
41
def focalLength_mm2px(focalLength, dslr_sens, focalPoint):
    """Convert a focal length from sensor units to pixels.

    ``focalPoint`` is doubled, so it is expected to be half of the image
    extent along the considered axis (``get_cam_int`` passes cx / cy).
    """
    return (focalLength / dslr_sens) * focalPoint * 2
44
+
45
+
46
def get_cam_int(fl, sens_w, sens_h, cx, cy):
    """Return the 3x3 pinhole intrinsics matrix for focal length ``fl``.

    The focal length is converted to pixels independently along x (using
    ``sens_w`` and ``cx``) and y (using ``sens_h`` and ``cy``).
    """
    focal_x = focalLength_mm2px(fl, sens_w, cx)
    focal_y = focalLength_mm2px(fl, sens_h, cy)
    return np.array(
        [
            [focal_x, 0, cx],
            [0, focal_y, cy],
            [0, 0, 1],
        ]
    )
51
+
52
+
53
def unreal2cv2(points):
    """Re-order the columns of ``points`` and flip the sign of the new y.

    For an input row ``(a, b, c)`` the output row is ``(b, -c, a)``.
    ``points`` must be 2-D with three columns (the roll is along axis 1).
    """
    # Cyclic shift of the columns by two positions: (a, b, c) -> (b, c, a).
    rolled = np.roll(points, 2, axis=1)
    # Negate the second column.
    return rolled * np.array([1.0, -1.0, 1.0])
59
+
60
+
61
def get_cam_trans(body_trans, cam_trans):
    """Return the body position relative to the camera as a (1, 3) array.

    Both inputs are divided by 100 (presumably centimetres to metres --
    TODO confirm) and converted with ``unreal2cv2`` before subtracting.
    """
    cam_cv = unreal2cv2(np.reshape(np.array(cam_trans) / 100, (1, 3)))
    body_cv = unreal2cv2(np.reshape(np.array(body_trans) / 100, (1, 3)))
    return body_cv - cam_cv
68
+
69
+
70
def get_cam_rotmat(pitch, yaw, roll):
    """Compose a rotation matrix from pitch / yaw / roll given in degrees.

    Yaw rotates about the y axis, pitch about x and roll about z (the
    non-zero component of each Rodrigues vector); the result is
    ``roll @ (pitch @ yaw)``.
    """
    yaw_vec = np.array([[0, (yaw / 180) * np.pi, 0]], dtype=float)
    pitch_vec = np.array([pitch / 180 * np.pi, 0, 0]).reshape(3, 1)
    roll_vec = np.array([0, 0, roll / 180 * np.pi]).reshape(3, 1)

    # cv2.Rodrigues returns (matrix, jacobian); only the matrix is needed.
    rot_yaw = cv2.Rodrigues(yaw_vec)[0]
    rot_pitch = cv2.Rodrigues(pitch_vec)[0]
    rot_roll = cv2.Rodrigues(roll_vec)[0]
    return rot_roll @ (rot_pitch @ rot_yaw)
76
+
77
+
78
def get_global_orient(cam_pitch, cam_yaw, cam_roll):
    """Return ``roll @ pitch`` as a 3x3 rotation matrix (angles in degrees).

    ``cam_yaw`` is accepted but not used by this function.
    """
    pitch_vec = np.array([cam_pitch / 180 * np.pi, 0, 0]).reshape(3, 1)
    roll_vec = np.array([0, 0, cam_roll / 180 * np.pi]).reshape(3, 1)
    # cv2.Rodrigues returns (matrix, jacobian); keep the matrix only.
    rot_pitch = cv2.Rodrigues(pitch_vec)[0]
    rot_roll = cv2.Rodrigues(roll_vec)[0]
    return rot_roll @ rot_pitch
87
+
88
+
89
def convert_translation_to_opencv(x, y, z):
    """Map a translation ``(x, y, z)`` to ``(y, -z, x)`` as a numpy array."""
    return np.array([y, -z, x])
92
+
93
+
94
def rotation_matrix_unreal(yaw, pitch, roll):
    """Build ``R_roll @ R_pitch @ R_yaw`` from angles given in degrees.

    Yaw is applied about z with a negated angle, pitch about y and roll
    about x.
    """
    yaw_rad, pitch_rad, roll_rad = (np.deg2rad(a) for a in (yaw, pitch, roll))

    # Yaw (left-handed): the angle is negated before building the matrix.
    cos_y, sin_y = np.cos(-yaw_rad), np.sin(-yaw_rad)
    R_yaw = np.array(
        [
            [cos_y, -sin_y, 0],
            [sin_y, cos_y, 0],
            [0, 0, 1],
        ]
    )

    # Pitch (right-handed), about the y axis.
    cos_p, sin_p = np.cos(pitch_rad), np.sin(pitch_rad)
    R_pitch = np.array(
        [
            [cos_p, 0, sin_p],
            [0, 1, 0],
            [-sin_p, 0, cos_p],
        ]
    )

    # Roll (right-handed), about the x axis.
    cos_r, sin_r = np.cos(roll_rad), np.sin(roll_rad)
    R_roll = np.array(
        [
            [1, 0, 0],
            [0, cos_r, -sin_r],
            [0, sin_r, cos_r],
        ]
    )

    return R_roll @ R_pitch @ R_yaw
124
+
125
+
126
def convert_rotation_to_opencv(R_unreal):
    """Express a rotation matrix in the permuted axes: ``C @ R @ C.T``.

    ``C`` maps ``(x, y, z)`` to ``(y, -z, x)``, the same mapping as
    ``convert_translation_to_opencv``.
    """
    axis_change = np.array(
        [
            [0, 1, 0],
            [0, 0, -1],
            [1, 0, 0],
        ]
    )
    return axis_change @ R_unreal @ axis_change.T
131
+
132
+
133
def get_rot_unreal(yaw, pitch, roll):
    """Build ``R_yaw @ R_pitch @ R_roll`` from angles given in degrees.

    Unlike ``rotation_matrix_unreal``, the yaw angle is not negated, the sine
    signs of pitch and roll are flipped, and the factors are multiplied in
    the opposite order.
    """
    yaw_rad, pitch_rad, roll_rad = (np.deg2rad(a) for a in (yaw, pitch, roll))

    cos_y, sin_y = np.cos(yaw_rad), np.sin(yaw_rad)
    cos_p, sin_p = np.cos(pitch_rad), np.sin(pitch_rad)
    cos_r, sin_r = np.cos(roll_rad), np.sin(roll_rad)

    # rotation about z
    R_yaw = np.array(
        [
            [cos_y, -sin_y, 0],
            [sin_y, cos_y, 0],
            [0, 0, 1],
        ]
    )
    # rotation about y
    R_pitch = np.array(
        [
            [cos_p, 0, -sin_p],
            [0, 1, 0],
            [sin_p, 0, cos_p],
        ]
    )
    # rotation about x
    R_roll = np.array(
        [
            [1, 0, 0],
            [0, cos_r, sin_r],
            [0, -sin_r, cos_r],
        ]
    )
    return R_yaw @ R_pitch @ R_roll
160
+
161
+
162
def get_extrinsics_unreal(R_unreal, t_unreal):
    """Assemble a 4x4 homogeneous matrix from a 3x3 rotation and a translation."""
    transform = np.eye(4)
    transform[:3, :3] = R_unreal
    # The translation (3 values) fills the last column of the top three rows.
    transform[:3, 3] = np.array(t_unreal).reshape(1, 3)
    return transform
168
+
169
+
170
def get_extrinsics_opencv(yaw, pitch, roll, x, y, z):
    """Return the inverse of ``T_opencv2unreal @ T_u2wu @ T_wu2ou`` (4x4).

    ``T_u2wu`` is built from ``get_rot_unreal`` and the position divided by
    100 (presumably centimetres to metres -- TODO confirm).
    """
    position = np.array([x / 100.0, y / 100.0, z / 100.0])
    T_u2wu = get_extrinsics_unreal(get_rot_unreal(yaw, pitch, roll), position)

    # Fixed axis permutations applied on either side of the Unreal pose.
    T_opencv2unreal = np.array(
        [
            [0, 0, -1, 0],
            [1, 0, 0, 0],
            [0, -1, 0, 0],
            [0, 0, 0, 1],
        ],
        dtype=np.float32,
    )
    T_wu2ou = np.array(
        [
            [0, 1, 0, 0],
            [1, 0, 0, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 1],
        ],
        dtype=np.float32,
    )
    return np.linalg.inv(T_opencv2unreal @ T_u2wu @ T_wu2ou)
181
+
182
+
183
+ # -----------------------------------------------------------------------------
184
+ # Get camera parameters from the extracted images and CSV data.
185
+ # -----------------------------------------------------------------------------
186
+
187
+
188
def get_params(
    image_folder,
    fl,
    trans_body,
    cam_x,
    cam_y,
    cam_z,
    fps,
    cam_pitch_,
    cam_roll_,
    cam_yaw_,
):
    """Compute intrinsics and extrinsics for every 5th image of a sequence.

    Args:
        image_folder: directory holding the sequence frames (``*.png``).
        fl: per-frame focal lengths, indexed by frame position.
        trans_body: unused.
        cam_x, cam_y, cam_z: per-frame camera positions; divided by 100
            before use (presumably centimetres to metres -- TODO confirm).
        fps: unused.
        cam_pitch_, cam_roll_, cam_yaw_: per-frame camera angles in degrees.

    Returns:
        ``(imgnames, cam_ext, cam_int)``: image names relative to the parent
        of ``image_folder`` (``<sequence dir>/<file name>``), 4x4 extrinsic
        matrices and 3x3 intrinsic matrices, one entry per kept frame.
    """
    all_images = sorted(glob(os.path.join(image_folder, "*" + IMG_FORMAT)))
    imgnames, cam_ext, cam_int = [], [], []

    for cam_ind, image_path in enumerate(all_images):
        # Process every 5th frame; the frame position also indexes the CSV columns.
        if cam_ind % 5 != 0:
            continue

        CAM_INT = get_cam_int(fl[cam_ind], SENSOR_W, SENSOR_H, IMG_W / 2.0, IMG_H / 2.0)

        rot_unreal = rotation_matrix_unreal(
            cam_yaw_[cam_ind], cam_pitch_[cam_ind], cam_roll_[cam_ind]
        )
        rot_cv = convert_rotation_to_opencv(rot_unreal)
        trans_cv = convert_translation_to_opencv(
            cam_x[cam_ind] / 100.0, cam_y[cam_ind] / 100.0, cam_z[cam_ind] / 100.0
        )
        cam_ext_ = np.eye(4)
        cam_ext_[:3, :3] = rot_cv
        # The camera pose is computed as the inverse of the transformed translation.
        cam_ext_[:3, 3] = -rot_cv @ trans_cv

        # Keep "<sequence dir>/<file name>". os.path handles the platform
        # separator and doubled separators, unlike splitting on "/".
        seq_dir, file_name = os.path.split(image_path)
        imgnames.append(os.path.join(os.path.basename(seq_dir), file_name))
        cam_ext.append(cam_ext_)
        cam_int.append(CAM_INT)
    return imgnames, cam_ext, cam_int
231
+
232
+
233
+ # -----------------------------------------------------------------------------
234
+ # Processing per sequence.
235
+ # -----------------------------------------------------------------------------
236
+
237
+
238
def process_seq(args):
    """
    Export one sequence: for every listed image, copy the RGB frame, store the
    depth map as ``.npy`` and store the camera as ``.npz`` (intrinsics plus the
    inverse of the extrinsic matrix, i.e. the camera pose in world coordinates).

    ``args`` is the 8-tuple ``(scene, seq_name, outdir, image_folder_base,
    depth_folder_base, imgnames, cam_ext, cam_int)``. Always returns ``None``.
    """
    (
        scene,
        seq_name,
        outdir,
        image_folder_base,
        depth_folder_base,
        imgnames,
        cam_ext,
        cam_int,
    ) = args

    # All outputs of a sequence live under "<outdir>/<scene>_<seq_name>/".
    seq_out_dir = os.path.join(outdir, "_".join([scene, seq_name]))
    out_rgb_dir = os.path.join(seq_out_dir, "rgb")
    out_depth_dir = os.path.join(seq_out_dir, "depth")
    out_cam_dir = os.path.join(seq_out_dir, "cam")
    for directory in (out_rgb_dir, out_depth_dir, out_cam_dir):
        os.makedirs(directory, exist_ok=True)

    assert (
        len(imgnames) == len(cam_ext) == len(cam_int)
    ), f"Inconsistent lengths for {scene}_{seq_name}"

    for imgname, ext, intr in zip(imgnames, cam_ext, cam_int):
        imgpath = os.path.join(image_folder_base, imgname)
        depthpath = os.path.join(
            depth_folder_base, imgname.replace(".png", "_depth.exr")
        )

        # Read the 'Depth' channel and divide by 100
        # (presumably centimetres to metres -- TODO confirm).
        raw_depth = OpenEXR.File(depthpath).parts[0].channels["Depth"].pixels
        depth = raw_depth.astype(np.float32) / 100.0

        frame_file = os.path.basename(imgpath)
        outimg_path = os.path.join(out_rgb_dir, frame_file)
        outdepth_path = os.path.join(out_depth_dir, frame_file.replace(".png", ".npy"))
        outcam_path = os.path.join(out_cam_dir, frame_file.replace(".png", ".npz"))

        shutil.copy(imgpath, outimg_path)
        np.save(outdepth_path, depth)
        np.savez(outcam_path, intrinsics=intr, pose=np.linalg.inv(ext))
    return None
282
+
283
+
284
+ # -----------------------------------------------------------------------------
285
+ # Main entry point.
286
+ # -----------------------------------------------------------------------------
287
+
288
+
289
def main():
    """Collect one export task per Bedlam scene and run the tasks in parallel.

    For each scene directory under ``--root`` (HDRI and "closeup" scenes
    excluded), the first sequence named in ``be_seq.csv`` that has a camera
    CSV is turned into a task for ``process_seq``.
    """
    parser = argparse.ArgumentParser(
        description="Process Bedlam scenes: compute camera intrinsics and extrinsics, "
        "and save processed camera files."
    )
    parser.add_argument(
        "--root",
        type=str,
        required=True,
        help="Root directory of the extracted data (scenes).",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="Output directory for processed data."
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=None,
        help="Number of worker processes (default: os.cpu_count()//2).",
    )
    args = parser.parse_args()

    root = args.root
    outdir = args.outdir
    if args.num_workers is not None:
        num_workers = args.num_workers
    else:
        # Half of the cores, but never 0: ProcessPoolExecutor rejects
        # max_workers <= 0, which the plain "// 2" yields on a 1-core machine.
        num_workers = max(1, (os.cpu_count() or 4) // 2)

    # Get scene directories from the root folder.
    scenes = sorted(
        [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
    )
    # Exclude HDRI scenes.
    hdri_scenes = [
        "20221010_3_1000_batch01hand",
        "20221017_3_1000_batch01hand",
        "20221018_3-8_250_batch01hand",
        "20221019_3_250_highbmihand",
    ]
    scenes = np.setdiff1d(scenes, hdri_scenes)

    tasks = []
    for scene in tqdm(scenes, desc="Collecting tasks"):
        # Skip closeup scenes.
        if "closeup" in scene:
            continue
        base_folder = os.path.join(root, scene)
        image_folder_base = os.path.join(root, scene, "png")
        depth_folder_base = os.path.join(root, scene, "depth")
        csv_path = os.path.join(base_folder, "be_seq.csv")
        if not os.path.exists(csv_path):
            continue
        csv_data = pd.read_csv(csv_path).to_dict("list")
        cam_csv_base = os.path.join(base_folder, "ground_truth", "camera")

        # Look for a row in the CSV with a "sequence_name" comment.
        for comment in csv_data.get("Comment", []):
            # Empty CSV cells are read as NaN (a float); only text can carry
            # a sequence name, so anything else is skipped instead of raising.
            if not isinstance(comment, str) or "sequence_name" not in comment:
                continue
            seq_name = comment.split(";")[0].split("=")[-1]
            cam_csv_path = os.path.join(cam_csv_base, seq_name + "_camera.csv")
            if not os.path.exists(cam_csv_path):
                continue
            cam_csv_data = pd.read_csv(cam_csv_path).to_dict("list")
            image_folder = os.path.join(image_folder_base, seq_name)
            trans_body = None  # Not used here.
            imgnames, cam_ext, cam_int = get_params(
                image_folder,
                cam_csv_data["focal_length"],
                trans_body,
                cam_csv_data["x"],
                cam_csv_data["y"],
                cam_csv_data["z"],
                6,
                cam_pitch_=cam_csv_data["pitch"],
                cam_roll_=cam_csv_data["roll"],
                cam_yaw_=cam_csv_data["yaw"],
            )
            tasks.append(
                (
                    scene,
                    seq_name,
                    outdir,
                    image_folder_base,
                    depth_folder_base,
                    imgnames,
                    cam_ext,
                    cam_int,
                )
            )
            # Process only the first valid sequence for this scene.
            break

    # Process each task in parallel.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(process_seq, task): task for task in tasks}
        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Processing sequences"
        ):
            # result() re-raises any exception from the worker process.
            error = future.result()
            if error:
                print(error)
399
+
400
+
401
# Run the preprocessing only when executed as a script, not on import.
if __name__ == "__main__":
    main()
extern/CUT3R/datasets_preprocess/preprocess_blendedmvs.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
3
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+ #
5
+ # --------------------------------------------------------
6
+ # Preprocessing code for the BlendedMVS dataset
7
+ # dataset at https://github.com/YoYo000/BlendedMVS
8
+ # 1) Download BlendedMVS.zip
9
+ # 2) Download BlendedMVS+.zip
10
+ # 3) Download BlendedMVS++.zip
11
+ # 4) Unzip everything in the same /path/to/tmp/blendedMVS/ directory
12
+ # 5) python datasets_preprocess/preprocess_blendedmvs.py --blendedmvs_dir /path/to/tmp/blendedMVS/ --precomputed_pairs /path/to/precomputed_pairs.npy
13
+ # --------------------------------------------------------
14
+ import os
15
+ import os.path as osp
16
+ import re
17
+ from tqdm import tqdm
18
+ import numpy as np
19
+
20
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
21
+ import cv2
22
+
23
+ import path_to_root # noqa
24
+ from datasets_preprocess.utils.parallel import parallel_threads
25
+ from datasets_preprocess.utils import cropping # noqa
26
+
27
+
28
def get_parser():
    """Build the command-line parser for the BlendedMVS preprocessing script.

    ``--blendedmvs_dir`` and ``--precomputed_pairs`` are mandatory;
    ``--output_dir`` defaults to ``data/blendedmvs_processed``.
    """
    import argparse

    cli = argparse.ArgumentParser()
    for required_flag in ("--blendedmvs_dir", "--precomputed_pairs"):
        cli.add_argument(required_flag, required=True)
    cli.add_argument("--output_dir", default="data/blendedmvs_processed")
    return cli
36
+
37
+
38
def main(db_root, pairs_path, output_dir):
    """Rescale every view of every BlendedMVS sequence, then check the pairs.

    Sequences are the entries of ``db_root`` whose name is 24 characters
    long. After processing, every image referenced by the array loaded from
    ``pairs_path`` must exist in ``output_dir``.
    """
    print(">> Listing all sequences")
    sequences = [name for name in os.listdir(db_root) if len(name) == 24]
    # should find 502 scenes
    assert sequences, f"did not found any sequences at {db_root}"
    print(f" (found {len(sequences)} sequences)")

    for seq in tqdm(sequences):
        out_dir = osp.join(output_dir, seq)
        os.makedirs(out_dir, exist_ok=True)

        # generate the crops
        root = osp.join(db_root, seq)
        func_args = []
        for cam_file in os.listdir(osp.join(root, "cams")):
            if cam_file.startswith("pair"):
                continue
            # drop the 8-character "_cam.txt" suffix to get the view name
            func_args.append((root, cam_file[:-8], out_dir))
        parallel_threads(load_crop_and_save, func_args, star_args=True, leave=False)

    # verify that all pairs are there
    pairs = np.load(pairs_path)
    for seqh, seql, img1, img2, _score in tqdm(pairs):
        for view_index in (img1, img2):
            impath = osp.join(
                output_dir, f"{seqh:08x}{seql:016x}", f"{view_index:08n}.jpg"
            )
            assert osp.isfile(impath), f"missing image at {impath=}"

    print(f">> Done, saved everything in {output_dir}/")
69
+
70
+
71
def load_crop_and_save(root, img, out_dir):
    """Rescale one view to 512x384 and write its image, depth and camera files.

    Nothing is done when ``<out_dir>/<img>.npz`` already exists. Outputs are
    ``<img>.jpg`` (quality 80), ``<img>.exr`` (depth) and ``<img>.npz``
    (intrinsics and cam-to-world rotation / translation).
    """
    camera_out = osp.join(out_dir, img + ".npz")
    if osp.isfile(camera_out):
        return  # already done

    # load everything
    intrinsics_in, R_camin2world, t_camin2world = _load_pose(
        osp.join(root, "cams", img + "_cam.txt")
    )
    bgr_image = cv2.imread(
        osp.join(root, "blended_images", img + ".jpg"), cv2.IMREAD_COLOR
    )
    color_image_in = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    depthmap_in = load_pfm_file(osp.join(root, "rendered_depth_maps", img + ".pfm"))

    # do the crop (input images must have a 4:3 aspect ratio)
    H, W = color_image_in.shape[:2]
    assert H * 4 == W * 3
    image, depthmap, intrinsics_out, R_in2out = _crop_image(
        intrinsics_in, color_image_in, depthmap_in, (512, 384)
    )

    # write everything
    image.save(osp.join(out_dir, img + ".jpg"), quality=80)
    cv2.imwrite(osp.join(out_dir, img + ".exr"), depthmap)

    # New camera parameters: rotation updated by R_in2out, translation unchanged
    np.savez(
        camera_out,
        intrinsics=intrinsics_out,
        R_cam2world=R_camin2world @ R_in2out.T,
        t_cam2world=t_camin2world,
    )
105
+
106
+
107
def _crop_image(intrinsics_in, color_image_in, depthmap_in, resolution_out=(800, 800)):
    """Rescale image, depth map and intrinsics to ``resolution_out``.

    The work is delegated to ``cropping.rescale_image_depthmap``; the returned
    ``R_in2out`` is always the 3x3 identity.
    """
    rescaled = cropping.rescale_image_depthmap(
        color_image_in, depthmap_in, intrinsics_in, resolution_out
    )
    image, depthmap, intrinsics_out = rescaled
    return image, depthmap, intrinsics_out, np.eye(3)
113
+
114
+
115
+ def _load_pose(path, ret_44=False):
116
+ f = open(path)
117
+ RT = np.loadtxt(f, skiprows=1, max_rows=4, dtype=np.float32)
118
+ assert RT.shape == (4, 4)
119
+ RT = np.linalg.inv(RT) # world2cam to cam2world
120
+
121
+ K = np.loadtxt(f, skiprows=2, max_rows=3, dtype=np.float32)
122
+ assert K.shape == (3, 3)
123
+
124
+ if ret_44:
125
+ return K, RT
126
+ return K, RT[:3, :3], RT[:3, 3] # , depth_uint8_to_f32
127
+
128
+
129
def load_pfm_file(file_path):
    """Load a PFM image as a numpy array, flipped vertically.

    Returns an array of shape ``(height, width, 3)`` for a ``PF`` (colour)
    file and ``(height, width)`` for a ``Pf`` (single-channel) file.

    Raises:
        ValueError: if the magic header or the dimensions line is malformed.
    """
    with open(file_path, "rb") as pfm:
        magic = pfm.readline().decode("UTF-8").strip()
        if magic not in ("PF", "Pf"):
            raise ValueError("The provided file is not a valid PFM file.")
        is_color = magic == "PF"

        size_match = re.match(r"^(\d+)\s(\d+)\s$", pfm.readline().decode("UTF-8"))
        if not size_match:
            raise ValueError("Invalid PFM header format.")
        img_width, img_height = map(int, size_match.groups())

        # The sign of the scale line encodes the byte order of the samples.
        endian_scale = float(pfm.readline().decode("UTF-8").strip())
        sample_dtype = "<f" if endian_scale < 0 else ">f"  # little / big endian

        samples = np.frombuffer(pfm.read(), dtype=sample_dtype)

    if is_color:
        target_shape = (img_height, img_width, 3)
    else:
        target_shape = (img_height, img_width)
    img_data = np.reshape(samples, target_shape)

    # Flip around the horizontal axis (flip code 0).
    return cv2.flip(img_data, 0)
163
+
164
+
165
# Script entry point: parse the CLI flags and run the preprocessing.
if __name__ == "__main__":
    args = get_parser().parse_args()
    main(args.blendedmvs_dir, args.precomputed_pairs, args.output_dir)
extern/CUT3R/datasets_preprocess/preprocess_co3d.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
3
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
4
+ #
5
+ # --------------------------------------------------------
6
+ # Script to pre-process the CO3D dataset.
7
+ # Usage:
8
+ # python3 datasets_preprocess/preprocess_co3d.py --co3d_dir /path/to/co3d
9
+ # --------------------------------------------------------
10
+
11
+ import argparse
12
+ import random
13
+ import gzip
14
+ import json
15
+ import os
16
+ import os.path as osp
17
+
18
+ import torch
19
+ import PIL.Image
20
+ import numpy as np
21
+ import cv2
22
+
23
+ from tqdm.auto import tqdm
24
+ import matplotlib.pyplot as plt
25
+
26
+ import path_to_root # noqa
27
+ import datasets_preprocess.utils.cropping as cropping # noqa
28
+
29
+
30
# All CO3D object categories, in alphabetical order.
CATEGORIES = [
    "apple", "backpack", "ball", "banana", "baseballbat", "baseballglove",
    "bench", "bicycle", "book", "bottle", "bowl", "broccoli", "cake", "car",
    "carrot", "cellphone", "chair", "couch", "cup", "donut", "frisbee",
    "hairdryer", "handbag", "hotdog", "hydrant", "keyboard", "kite", "laptop",
    "microwave", "motorcycle", "mouse", "orange", "parkingmeter", "pizza",
    "plant", "remote", "sandwich", "skateboard", "stopsign", "suitcase",
    "teddybear", "toaster", "toilet", "toybus", "toyplane", "toytrain",
    "toytruck", "tv", "umbrella", "vase", "wineglass",
]
# Position of each category in CATEGORIES (for seeding).
CATEGORIES_IDX = dict(zip(CATEGORIES, range(len(CATEGORIES))))

# Every category except microwave, stopsign and tv, sorted by name.
SINGLE_SEQUENCE_CATEGORIES = sorted(
    cat for cat in set(CATEGORIES) if cat not in ("microwave", "stopsign", "tv")
)
88
+
89
+
90
def get_parser():
    """Build the command-line parser for the CO3D preprocessing script.

    Only ``--co3d_dir`` is mandatory; every other option has a default.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--category", type=str, default=None)
    cli.add_argument(
        "--single_sequence_subset",
        action="store_true",
        default=False,
        help="prepare the single_sequence_subset instead.",
    )
    cli.add_argument("--output_dir", type=str, default="data/co3d_processed")
    cli.add_argument("--co3d_dir", type=str, required=True)
    cli.add_argument("--num_sequences_per_object", type=int, default=50)
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument(
        "--min_quality",
        type=float,
        default=0.5,
        help="Minimum viewpoint quality score.",
    )
    img_size_help = (
        "lower dimension will be >= img_size * 3/4, and max dimension will be >= img_size"
    )
    cli.add_argument("--img_size", type=int, default=512, help=img_size_help)
    return cli
119
+
120
+
121
def convert_ndc_to_pinhole(focal_length, principal_point, image_size):
    """Convert NDC focal length / principal point to a pixel-space K matrix.

    ``image_size`` is read as ``(height, width)``. Both quantities are scaled
    by half of the smaller image side, and the principal point is measured
    from the image centre with its sign flipped. Returns a float32 3x3 array.
    """
    half_wh = np.array([image_size[1], image_size[0]]) / 2
    ndc_scale = half_wh.min()

    center_px = half_wh - np.array(principal_point) * ndc_scale
    focal_px = np.array(focal_length) * ndc_scale

    intrinsics = [
        [focal_px[0], 0.0, center_px[0]],
        [0.0, focal_px[1], center_px[1]],
        [0.0, 0.0, 1.0],
    ]
    return np.array(intrinsics, dtype=np.float32)
133
+
134
+
135
def opencv_from_cameras_projection(R, T, focal, p0, image_size):
    """Convert one camera given as numpy arrays to ``(R, tvec, K)`` tensors.

    The first two columns of ``R`` and the first two entries of ``T`` are
    negated, ``R`` is transposed, and the NDC focal length / principal point
    are scaled by half of the smaller image side. ``image_size`` is read as
    ``(height, width)``. The input arrays are not modified.

    Returns:
        ``(R, tvec, camera_matrix)`` as torch tensors of shapes
        ``(3, 3)``, ``(3,)`` and ``(3, 3)``.
    """
    # Add a leading batch dimension of size one to every input.
    R_batch = torch.from_numpy(R)[None, :, :]
    T_batch = torch.from_numpy(T)[None, :]
    focal_batch = torch.from_numpy(focal)[None, :]
    p0_batch = torch.from_numpy(p0)[None, :]
    size_batch = torch.from_numpy(image_size)[None, :]

    # Work on clones: from_numpy shares memory with the caller's arrays.
    tvec = T_batch.clone()
    tvec[:, :2] *= -1
    rotation = R_batch.clone()
    rotation[:, :, :2] *= -1
    rotation = rotation.permute(0, 2, 1)

    # Retype the image_size correctly and flip to width, height.
    size_wh = size_batch.to(rotation).flip(dims=(1,))

    # NDC to screen conversion.
    ndc_scale = (size_wh.min(dim=1, keepdim=True)[0] / 2.0).expand(-1, 2)
    image_center = size_wh / 2.0
    principal_point = -p0_batch * ndc_scale + image_center
    focal_length = focal_batch * ndc_scale

    camera_matrix = torch.zeros_like(rotation)
    camera_matrix[:, 0, 0] = focal_length[:, 0]
    camera_matrix[:, 1, 1] = focal_length[:, 1]
    camera_matrix[:, :2, 2] = principal_point
    camera_matrix[:, 2, 2] = 1.0
    return rotation[0], tvec[0], camera_matrix[0]
168
+
169
+
170
def get_set_list(category_dir, split, is_single_sequence_subset=False):
    """Collect the entries of ``split`` from a category's set-list files.

    Files in ``<category_dir>/set_lists`` are selected by name: those
    containing ``manyview_dev`` for the single-sequence subset, otherwise
    those containing ``fewview_train``. Files are read in sorted name order
    so the concatenated result does not depend on directory listing order.

    Args:
        category_dir: directory of one CO3D category.
        split: key to read from each JSON file (e.g. ``"train"``).
        is_single_sequence_subset: select the ``manyview_dev`` lists instead.

    Returns:
        A list with the entries of every selected file, concatenated.

    Raises:
        KeyError: if a selected file has no ``split`` key.
    """
    set_lists_dir = osp.join(category_dir, "set_lists")
    # not all objects have manyview_dev
    name_tag = "manyview_dev" if is_single_sequence_subset else "fewview_train"
    subset_list_files = sorted(
        name for name in os.listdir(set_lists_dir) if name_tag in name
    )

    sequences_all = []
    for subset_list_file in subset_list_files:
        with open(osp.join(set_lists_dir, subset_list_file)) as f:
            sequences_all.extend(json.load(f)[split])

    return sequences_all
185
+
186
+
187
def prepare_sequences(
    category,
    co3d_dir,
    output_dir,
    img_size,
    split,
    min_quality,
    max_num_sequences_per_object,
    seed,
    is_single_sequence_subset=False,
):
    """Crop, rescale and export the selected CO3D frames of one category.

    Sequences listed for ``split`` are kept when their
    ``viewpoint_quality_score`` exceeds ``min_quality``; if at least
    ``max_num_sequences_per_object`` remain, that many are drawn with
    ``random.sample`` after seeding the global ``random`` module with ``seed``.
    For every frame of a selected sequence the RGB image, a depth map
    normalised to 16 bits, the mask and an ``.npz`` with intrinsics, pose and
    maximum depth are written below ``output_dir`` under the same relative
    paths as in ``co3d_dir``.

    Args:
        category: Category folder name inside ``co3d_dir``.
        co3d_dir: Root of the raw dataset.
        output_dir: Root of the processed dataset.
        img_size: Target size used to derive the output resolution.
        split: Set-list key to process.
        min_quality: Exclusive lower bound on the viewpoint quality score.
        max_num_sequences_per_object: Maximum number of sequences to keep.
        seed: Seed passed to ``random.seed``.
        is_single_sequence_subset: Forwarded to ``get_set_list``.

    Returns:
        Dict mapping each selected sequence name to the list of frame indices
        (parsed from the image file names) that were processed.
    """
    random.seed(seed)
    category_dir = osp.join(co3d_dir, category)
    sequences_all = get_set_list(category_dir, split, is_single_sequence_subset)
    sequences_numbers = sorted(set(seq_name for seq_name, _, _ in sequences_all))

    frame_file = osp.join(category_dir, "frame_annotations.jgz")
    sequence_file = osp.join(category_dir, "sequence_annotations.jgz")

    with gzip.open(frame_file, "r") as fin:
        frame_annotations = json.loads(fin.read())
    with gzip.open(sequence_file, "r") as fin:
        sequence_data = json.loads(fin.read())

    # Index frame annotations as {sequence_name: {frame_number: annotation}}.
    frame_data_processed = {}
    for f_data in frame_annotations:
        frame_data_processed.setdefault(f_data["sequence_name"], {})[
            f_data["frame_number"]
        ] = f_data

    good_quality_sequences = {
        seq_data["sequence_name"]
        for seq_data in sequence_data
        if seq_data["viewpoint_quality_score"] > min_quality
    }

    sequences_numbers = [
        seq_name for seq_name in sequences_numbers if seq_name in good_quality_sequences
    ]
    if len(sequences_numbers) < max_num_sequences_per_object:
        selected_sequences_numbers = sequences_numbers
    else:
        selected_sequences_numbers = random.sample(
            sequences_numbers, max_num_sequences_per_object
        )

    selected_sequences_numbers_dict = {
        seq_name: [] for seq_name in selected_sequences_numbers
    }
    sequences_all = [
        (seq_name, frame_number, filepath)
        for seq_name, frame_number, filepath in sequences_all
        if seq_name in selected_sequences_numbers_dict
    ]

    for seq_name, frame_number, filepath in tqdm(sequences_all):
        # The index is the file name minus its first 5 and last 4 characters
        # (presumably "frame" and ".jpg" -- confirm against the dataset).
        frame_idx = int(filepath.split("/")[-1][5:-4])
        selected_sequences_numbers_dict[seq_name].append(frame_idx)
        mask_path = filepath.replace("images", "masks").replace(".jpg", ".png")

        frame_entry = frame_data_processed[seq_name][frame_number]
        viewpoint = frame_entry["viewpoint"]
        R, tvec, camera_intrinsics = opencv_from_cameras_projection(
            np.array(viewpoint["R"]),
            np.array(viewpoint["T"]),
            np.array(viewpoint["focal_length"]),
            np.array(viewpoint["principal_point"]),
            np.array(frame_entry["image"]["size"]),
        )

        depth_path = os.path.join(co3d_dir, frame_entry["depth"]["path"])
        assert frame_entry["depth"]["scale_adjustment"] == 1.0
        image_path = os.path.join(co3d_dir, filepath)
        mask_path_full = os.path.join(co3d_dir, mask_path)

        input_rgb_image = PIL.Image.open(image_path).convert("RGB")
        input_mask = plt.imread(mask_path_full)

        with PIL.Image.open(depth_path) as depth_pil:
            # the image is stored with 16-bit depth but PIL reads it as I (32 bit).
            # we cast it to uint16, then reinterpret as float16, then cast to float32
            input_depthmap = (
                np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16)
                .astype(np.float32)
                .reshape((depth_pil.size[1], depth_pil.size[0]))
            )
        # Depth and mask travel together through the crop / rescale helpers.
        depth_mask = np.stack((input_depthmap, input_mask), axis=-1)
        H, W = input_depthmap.shape

        camera_intrinsics = camera_intrinsics.numpy()
        cx, cy = camera_intrinsics[:2, 2].round().astype(int)
        min_margin_x = min(cx, W - cx)
        min_margin_y = min(cy, H - cy)

        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
        l, t = cx - min_margin_x, cy - min_margin_y
        r, b = cx + min_margin_x, cy + min_margin_y
        crop_bbox = (l, t, r, b)
        input_rgb_image, depth_mask, input_camera_intrinsics = (
            cropping.crop_image_depthmap(
                input_rgb_image, depth_mask, camera_intrinsics, crop_bbox
            )
        )

        # try to set the lower dimension to img_size * 3/4 -> img_size=512 => 384
        scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8
        output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
        if max(output_resolution) < img_size:
            # let's put the max dimension to img_size
            scale_final = (img_size / max(H, W)) + 1e-8
            output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)

        input_rgb_image, depth_mask, input_camera_intrinsics = (
            cropping.rescale_image_depthmap(
                input_rgb_image, depth_mask, input_camera_intrinsics, output_resolution
            )
        )
        input_depthmap = depth_mask[:, :, 0]
        input_mask = depth_mask[:, :, 1]

        # generate and adjust camera pose: [R | t] is inverted before saving
        camera_pose = np.eye(4, dtype=np.float32)
        camera_pose[:3, :3] = R
        camera_pose[:3, 3] = tvec
        camera_pose = np.linalg.inv(camera_pose)

        # save crop images and depth, metadata
        save_img_path = os.path.join(output_dir, filepath)
        save_depth_path = os.path.join(output_dir, frame_entry["depth"]["path"])
        save_mask_path = os.path.join(output_dir, mask_path)
        os.makedirs(os.path.split(save_img_path)[0], exist_ok=True)
        os.makedirs(os.path.split(save_depth_path)[0], exist_ok=True)
        os.makedirs(os.path.split(save_mask_path)[0], exist_ok=True)

        input_rgb_image.save(save_img_path)
        maximum_depth = np.max(input_depthmap)
        if maximum_depth > 0:
            scaled_depth_map = (input_depthmap / maximum_depth * 65535).astype(
                np.uint16
            )
        else:
            # An all-zero depth map would otherwise divide by zero and push
            # NaNs through the uint16 cast.
            scaled_depth_map = np.zeros(input_depthmap.shape, dtype=np.uint16)
        cv2.imwrite(save_depth_path, scaled_depth_map)
        cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8))

        # Swap only the file extension; a plain str.replace("jpg", "npz")
        # would also rewrite "jpg" occurring in a directory name.
        save_meta_path = osp.splitext(save_img_path)[0] + ".npz"
        np.savez(
            save_meta_path,
            camera_intrinsics=input_camera_intrinsics,
            camera_pose=camera_pose,
            maximum_depth=maximum_depth,
        )

    return selected_sequences_numbers_dict
341
+
342
+
343
if __name__ == "__main__":
    # Script entry point: process every requested category for the "train"
    # and "test" splits, caching the selection per category and per split.
    parser = get_parser()
    args = parser.parse_args()
    # Explicit check instead of `assert`, which is stripped under `python -O`;
    # writing into the input tree would overwrite the raw dataset.
    if args.co3d_dir == args.output_dir:
        parser.error("co3d_dir and output_dir must be different directories")

    if args.category is not None:
        categories = [args.category]
    elif args.single_sequence_subset:
        categories = SINGLE_SEQUENCE_CATEGORIES
    else:
        categories = CATEGORIES
    os.makedirs(args.output_dir, exist_ok=True)

    for split in ["train", "test"]:
        selected_sequences_path = os.path.join(
            args.output_dir, f"selected_seqs_{split}.json"
        )
        # The global file is written last, so its presence marks a finished split.
        if os.path.isfile(selected_sequences_path):
            continue

        all_selected_sequences = {}
        for category in categories:
            category_output_dir = osp.join(args.output_dir, category)
            os.makedirs(category_output_dir, exist_ok=True)
            category_selected_sequences_path = os.path.join(
                category_output_dir, f"selected_seqs_{split}.json"
            )
            if os.path.isfile(category_selected_sequences_path):
                # Category already processed by an earlier run: reuse it.
                with open(category_selected_sequences_path, "r") as fid:
                    category_selected_sequences = json.load(fid)
            else:
                print(f"Processing {split} - category = {category}")
                category_selected_sequences = prepare_sequences(
                    category=category,
                    co3d_dir=args.co3d_dir,
                    output_dir=args.output_dir,
                    img_size=args.img_size,
                    split=split,
                    min_quality=args.min_quality,
                    max_num_sequences_per_object=args.num_sequences_per_object,
                    # A distinct seed per category keeps selections independent.
                    seed=args.seed + CATEGORIES_IDX[category],
                    is_single_sequence_subset=args.single_sequence_subset,
                )
                with open(category_selected_sequences_path, "w") as file:
                    json.dump(category_selected_sequences, file)

            all_selected_sequences[category] = category_selected_sequences
        with open(selected_sequences_path, "w") as file:
            json.dump(all_selected_sequences, file)
extern/CUT3R/datasets_preprocess/preprocess_cop3d.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # --------------------------------------------------------
4
+ # Script to pre-process the COP3D dataset.
5
+ # Usage:
6
+ # python3 preprocess_cop3d.py --cop3d_dir /path/to/cop3d \
7
+ # --output_dir /path/to/processed_cop3d
8
+ # --------------------------------------------------------
9
+
10
+ import argparse
11
+ import random
12
+ import gzip
13
+ import json
14
+ import os
15
+ import os.path as osp
16
+
17
+ import torch
18
+ import PIL.Image
19
+ import numpy as np
20
+ import cv2
21
+
22
+ from tqdm.auto import tqdm
23
+ import matplotlib.pyplot as plt
24
+
25
+ import src.dust3r.datasets.utils.cropping as cropping
26
+
27
+ # Define the object categories. (These are used for seeding.)
28
+ CATEGORIES = ["cat", "dog"]
29
+ CATEGORIES_IDX = {cat: i for i, cat in enumerate(CATEGORIES)}
30
+
31
+
32
def get_parser():
    """Build the command-line parser of the COP3D preprocessing script.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``--output_dir``,
        ``--cop3d_dir``, ``--seed``, ``--min_quality`` and ``--img_size``.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess the COP3D dataset and output processed images, masks, and metadata."
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="",
        help="Output directory for processed COP3D data.",
    )
    parser.add_argument(
        "--cop3d_dir",
        type=str,
        default="",
        help="Directory containing the raw COP3D data.",
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for reproducibility."
    )
    parser.add_argument(
        "--min_quality",
        type=float,
        default=0.5,
        help="Minimum viewpoint quality score.",
    )
    parser.add_argument(
        "--img_size",
        type=int,
        default=512,
        help=(
            "Lower dimension will be >= img_size * 3/4, and max dimension will be >= img_size"
        ),
    )
    return parser
67
+
68
+
69
def convert_ndc_to_pinhole(focal_length, principal_point, image_size):
    """Build a 3x3 pixel-space intrinsic matrix from NDC camera parameters.

    Args:
        focal_length: Two-element focal length in NDC units.
        principal_point: Two-element principal point in NDC units.
        image_size: Sequence whose first two items are read as
            (height, width).

    Returns:
        ``float32`` numpy array ``[[fx, 0, cx], [0, fy, cy], [0, 0, 1]]``.
    """
    height, width = image_size[0], image_size[1]
    half_wh = np.array([width, height]) / 2
    # NDC units are scaled by half of the shorter image side.
    ndc_scale = half_wh.min()

    focal_px = np.array(focal_length) * ndc_scale
    center_px = half_wh - np.array(principal_point) * ndc_scale

    K = np.eye(3, dtype=np.float32)
    K[0, 0] = focal_px[0]
    K[1, 1] = focal_px[1]
    K[0, 2] = center_px[0]
    K[1, 2] = center_px[1]
    return K
82
+
83
+
84
def opencv_from_cameras_projection(R, T, focal, p0, image_size):
    """
    Convert camera projection parameters from CO3D (NDC) to OpenCV coordinates.

    Args:
        R: (3, 3) numpy rotation matrix.
        T: (3,) numpy translation vector.
        focal: (2,) numpy focal length in NDC units.
        p0: (2,) numpy principal point in NDC units.
        image_size: (2,) numpy array; flipped below to (width, height), so it
            is read as (height, width).

    Returns:
        R, tvec, camera_matrix: OpenCV-style rotation matrix, translation vector, and intrinsic matrix
        (torch tensors, batch dimension removed).
    """
    # Batch every input with a leading dimension of one.
    rot_b = torch.from_numpy(R)[None, :, :]
    trans_b = torch.from_numpy(T)[None, :]
    focal_b = torch.from_numpy(focal)[None, :]
    pp_b = torch.from_numpy(p0)[None, :]
    size_b = torch.from_numpy(image_size)[None, :]

    # Axis flip on copies: x / y of the translation and the first two
    # rotation columns change sign, then the rotation is transposed.
    tvec = trans_b.clone()
    tvec[:, :2] *= -1
    rot_neg = rot_b.clone()
    rot_neg[:, :, :2] *= -1
    rot_out = rot_neg.permute(0, 2, 1)

    # Image size as (width, height) in the rotation's dtype / device.
    size_wh = size_b.to(rot_out).flip(dims=(1,))

    # One NDC unit equals half of the shorter image side, on both axes.
    scale = (size_wh.to(rot_out).min(dim=1, keepdim=True)[0] / 2.0).expand(-1, 2)
    image_center = size_wh / 2.0
    pp_px = -pp_b * scale + image_center
    focal_px = focal_b * scale

    camera_matrix = torch.zeros_like(rot_out)
    camera_matrix[:, 0, 0] = focal_px[:, 0]
    camera_matrix[:, 1, 1] = focal_px[:, 1]
    camera_matrix[:, :2, 2] = pp_px
    camera_matrix[:, 2, 2] = 1.0
    return rot_out[0], tvec[0], camera_matrix[0]
123
+
124
+
125
def get_set_list(category_dir, split):
    """Obtain a list of sequences for a given category and split.

    Set-list files whose name contains ``"manyview"`` are used; when there are
    none, the files containing ``"fewview"`` are used instead.

    Args:
        category_dir: Category directory that contains a ``set_lists`` folder.
        split: Key read from every selected set-list JSON file.

    Returns:
        A list concatenating ``data[split]`` of every selected file, visited in
        sorted filename order.
    """
    set_lists_dir = osp.join(category_dir, "set_lists")
    # os.listdir() order is arbitrary; sort once so the output is reproducible.
    listfiles = sorted(os.listdir(set_lists_dir))
    subset_list_files = [f for f in listfiles if "manyview" in f]
    if not subset_list_files:
        subset_list_files = [f for f in listfiles if "fewview" in f]

    sequences_all = []
    for subset_list_file in subset_list_files:
        with open(osp.join(set_lists_dir, subset_list_file)) as f:
            sequences_all.extend(json.load(f)[split])
    return sequences_all
138
+
139
+
140
def prepare_sequences(
    category, cop3d_dir, output_dir, img_size, split, min_quality, seed
):
    """
    Process sequences for a given category and split.

    This function loads per-frame and per-sequence annotations,
    filters sequences based on quality, crops and rescales images,
    and saves metadata for each frame.

    Args:
        category: Category folder name inside ``cop3d_dir``.
        cop3d_dir: Root of the raw dataset.
        output_dir: Root of the processed dataset; images, masks and ``.npz``
            files are written under the same relative paths as the inputs.
        img_size: Target size used to derive the output resolution.
        split: Set-list key to process.
        min_quality: Exclusive lower bound on ``viewpoint_quality_score``.
        seed: Seed passed to ``random.seed``.

    Returns a dictionary mapping sequence names to lists of selected frame indices.
    """
    random.seed(seed)
    category_dir = osp.join(cop3d_dir, category)
    sequences_all = get_set_list(category_dir, split)

    # Get unique sequence names.
    sequences_numbers = sorted(set(seq_name for seq_name, _, _ in sequences_all))

    # Load frame and sequence annotation files.
    frame_file = osp.join(category_dir, "frame_annotations.jgz")
    sequence_file = osp.join(category_dir, "sequence_annotations.jgz")

    with gzip.open(frame_file, "r") as fin:
        frame_data = json.loads(fin.read())
    with gzip.open(sequence_file, "r") as fin:
        sequence_data = json.loads(fin.read())

    # Organize frame annotations as {sequence_name: {frame_number: annotation}}.
    frame_data_processed = {}
    for f_data in frame_data:
        frame_data_processed.setdefault(f_data["sequence_name"], {})[
            f_data["frame_number"]
        ] = f_data

    # Select sequences with quality above the threshold.
    good_quality_sequences = {
        seq_data["sequence_name"]
        for seq_data in sequence_data
        if seq_data["viewpoint_quality_score"] > min_quality
    }
    selected_sequences_numbers_dict = {
        seq_name: []
        for seq_name in sequences_numbers
        if seq_name in good_quality_sequences
    }

    # Filter frames to only those from selected sequences.
    sequences_all = [
        (seq_name, frame_number, filepath)
        for seq_name, frame_number, filepath in sequences_all
        if seq_name in selected_sequences_numbers_dict
    ]

    # Process each frame.
    for seq_name, frame_number, filepath in tqdm(
        sequences_all, desc="Processing frames"
    ):
        # The index is the file name minus its first 5 and last 4 characters
        # (presumably "frame" and ".jpg" -- confirm against the dataset).
        frame_idx = int(filepath.split("/")[-1][5:-4])
        selected_sequences_numbers_dict[seq_name].append(frame_idx)
        mask_path = filepath.replace("images", "masks").replace(".jpg", ".png")

        frame_data_entry = frame_data_processed[seq_name][frame_number]
        viewpoint = frame_data_entry["viewpoint"]
        R, tvec, camera_intrinsics = opencv_from_cameras_projection(
            np.array(viewpoint["R"]),
            np.array(viewpoint["T"]),
            np.array(viewpoint["focal_length"]),
            np.array(viewpoint["principal_point"]),
            np.array(frame_data_entry["image"]["size"]),
        )

        # Load input image and mask.
        image_path = osp.join(cop3d_dir, filepath)
        mask_path_full = osp.join(cop3d_dir, mask_path)
        input_rgb_image = PIL.Image.open(image_path).convert("RGB")
        input_mask = plt.imread(mask_path_full)
        H, W = input_mask.shape

        # Largest window centred on the (rounded) principal point that still
        # fits inside the image.
        camera_intrinsics = camera_intrinsics.numpy()
        cx, cy = camera_intrinsics[:2, 2].round().astype(int)
        min_margin_x = min(cx, W - cx)
        min_margin_y = min(cy, H - cy)
        l, t = cx - min_margin_x, cy - min_margin_y
        r, b = cx + min_margin_x, cy + min_margin_y
        crop_bbox = (l, t, r, b)

        # Crop the image, mask, and adjust intrinsics.
        input_rgb_image, input_mask, input_camera_intrinsics = (
            cropping.crop_image_depthmap(
                input_rgb_image, input_mask, camera_intrinsics, crop_bbox
            )
        )

        # Aim for a short side of img_size * 3/4; if the long side would then
        # fall below img_size, scale the long side to img_size instead.
        scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8
        output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
        if max(output_resolution) < img_size:
            scale_final = (img_size / max(H, W)) + 1e-8
            output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int)
        input_rgb_image, input_mask, input_camera_intrinsics = (
            cropping.rescale_image_depthmap(
                input_rgb_image, input_mask, input_camera_intrinsics, output_resolution
            )
        )

        # Generate and adjust camera pose: [R | t] is inverted before saving.
        camera_pose = np.eye(4, dtype=np.float32)
        camera_pose[:3, :3] = R
        camera_pose[:3, 3] = tvec
        camera_pose = np.linalg.inv(camera_pose)

        # Save processed image and mask.
        save_img_path = osp.join(output_dir, filepath)
        save_mask_path = osp.join(output_dir, mask_path)
        os.makedirs(osp.split(save_img_path)[0], exist_ok=True)
        os.makedirs(osp.split(save_mask_path)[0], exist_ok=True)
        input_rgb_image.save(save_img_path)
        cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8))

        # Save metadata (intrinsics and pose). Swap only the file extension:
        # str.replace("jpg", "npz") would also rewrite "jpg" in a directory name.
        save_meta_path = osp.splitext(save_img_path)[0] + ".npz"
        np.savez(
            save_meta_path,
            camera_intrinsics=input_camera_intrinsics,
            camera_pose=camera_pose,
        )

    return selected_sequences_numbers_dict
272
+
273
+
274
def main():
    """Run the COP3D preprocessing for every category and both splits.

    Per-category selections are cached in
    ``<output_dir>/<category>/selected_seqs_<split>.json`` and reused on later
    runs; a split whose global ``<output_dir>/selected_seqs_<split>.json``
    already exists is skipped entirely.
    """
    parser = get_parser()
    args = parser.parse_args()
    # Explicit check instead of `assert`, which is stripped under `python -O`;
    # writing into the input tree would overwrite the raw dataset.
    if args.cop3d_dir == args.output_dir:
        parser.error("Input and output directories must differ.")
    categories = CATEGORIES
    os.makedirs(args.output_dir, exist_ok=True)

    # Process each split separately.
    for split in ["train", "test"]:
        selected_sequences_path = osp.join(
            args.output_dir, f"selected_seqs_{split}.json"
        )
        # The global file is written last, so its presence marks a finished split.
        if os.path.isfile(selected_sequences_path):
            continue

        all_selected_sequences = {}
        for category in categories:
            category_output_dir = osp.join(args.output_dir, category)
            os.makedirs(category_output_dir, exist_ok=True)
            category_selected_sequences_path = osp.join(
                category_output_dir, f"selected_seqs_{split}.json"
            )
            if os.path.isfile(category_selected_sequences_path):
                # Category already processed by an earlier run: reuse it.
                with open(category_selected_sequences_path, "r") as fid:
                    category_selected_sequences = json.load(fid)
            else:
                print(f"Processing {split} - category = {category}")
                category_selected_sequences = prepare_sequences(
                    category=category,
                    cop3d_dir=args.cop3d_dir,
                    output_dir=args.output_dir,
                    img_size=args.img_size,
                    split=split,
                    min_quality=args.min_quality,
                    seed=args.seed + CATEGORIES_IDX[category],
                )
                with open(category_selected_sequences_path, "w") as file:
                    json.dump(category_selected_sequences, file)

            all_selected_sequences[category] = category_selected_sequences

        with open(selected_sequences_path, "w") as file:
            json.dump(all_selected_sequences, file)


if __name__ == "__main__":
    main()
extern/CUT3R/datasets_preprocess/preprocess_dl3dv.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import random
3
+ import gzip
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ import os.path as osp
9
+
10
+ import torch
11
+ import PIL.Image
12
+ from PIL import Image
13
+ import numpy as np
14
+ import cv2
15
+
16
+ from tqdm import tqdm
17
+ import matplotlib.pyplot as plt
18
+ import shutil
19
+ from read_write_model import run
20
+
21
+ import torch
22
+ import torchvision
23
+
24
+
25
def get_parser():
    """Build the command-line parser of the DL3DV preprocessing script.

    Returns:
        An ``argparse.ArgumentParser`` with ``--dl3dv_dir`` (input root) and
        ``--output_dir`` (output root), both defaulting to relative paths.
    """
    cli = argparse.ArgumentParser()
    # TODO: both defaults are placeholders inherited from the original script.
    for flag, default_path in (
        ("--dl3dv_dir", "../DL3DV-Dense/3K/"),
        ("--output_dir", "../processed_dl3dv/3K/"),
    ):
        cli.add_argument(flag, default=default_path)
    return cli
32
+
33
+
34
+ from scipy.spatial.transform import Rotation as R
35
+
36
+
37
def read_array(path):
    """Read a binary array file with a ``width&height&channels&`` text header.

    The header is followed by raw ``float32`` values stored in column-major
    (Fortran) order.

    Args:
        path: Path of the file to read.

    Returns:
        Numpy array indexed as ``(height, width, channels)`` with size-one
        axes squeezed out.

    Raises:
        ValueError: If the file ends before the third ``&`` delimiter.
    """
    with open(path, "rb") as fid:
        width, height, channels = np.genfromtxt(
            fid, delimiter="&", max_rows=1, usecols=(0, 1, 2), dtype=int
        )
        # Rewind and skip past the third delimiter to reach the binary payload.
        fid.seek(0)
        num_delimiter = 0
        while num_delimiter < 3:
            byte = fid.read(1)
            if not byte:
                # read() keeps returning b"" at EOF; without this check a
                # truncated header would spin forever.
                raise ValueError(
                    f"Unexpected end of file while reading header of {path}"
                )
            if byte == b"&":
                num_delimiter += 1
        array = np.fromfile(fid, np.float32)
    array = array.reshape((width, height, channels), order="F")
    return np.transpose(array, (1, 0, 2)).squeeze()
54
+
55
+
56
def main(rootdir, outdir):
    """Export DL3DV images and per-image camera files from COLMAP text models.

    Every ``<rootdir>/<env>/dense*`` folder that has both a ``sparse`` and an
    ``images`` sub-folder is considered. Intrinsics are read from
    ``sparse/cameras.txt`` and poses from ``sparse/images.txt``; each image is
    re-saved under ``<outdir>/<env>/<subseq>/rgb`` and its 3x3 intrinsic
    matrix and inverted 4x4 pose under ``.../cam`` as an ``.npz`` file.
    Depth export is disabled.

    Args:
        rootdir: Root of the raw dataset.
        outdir: Root of the processed dataset (created if missing).
    """
    os.makedirs(outdir, exist_ok=True)

    envs = [f for f in os.listdir(rootdir) if os.path.isdir(osp.join(rootdir, f))]
    for env in tqdm(envs):
        subseqs = [
            f
            for f in os.listdir(osp.join(rootdir, env))
            if os.path.isdir(osp.join(rootdir, env, f)) and f.startswith("dense")
        ]
        for subseq in subseqs:
            sparse_dir = osp.join(rootdir, env, subseq, "sparse")
            images_dir = osp.join(rootdir, env, subseq, "images")
            if not (os.path.exists(sparse_dir) and os.path.exists(images_dir)):
                continue
            intrins_file = sparse_dir + "/cameras.txt"
            poses_file = sparse_dir + "/images.txt"
            # NOTE(review): a sequence whose text model already exists is
            # skipped entirely, so an interrupted export is never resumed
            # (the per-image "Exist!" check below cannot be reached on a
            # re-run). Confirm this is intended rather than an inverted test.
            if os.path.exists(intrins_file) and os.path.exists(poses_file):
                continue
            # Presumably writes cameras.txt / images.txt into sparse_dir --
            # confirm against read_write_model.run.
            run(sparse_dir, sparse_dir)

            # camera id -> 3x3 intrinsic matrix; columns 4..7 are read as
            # fx, fy, cx, cy.
            cam_params = {}
            with open(intrins_file, "r") as f:
                for line in f:
                    if line.startswith("#"):
                        continue
                    parts = line.strip().split()
                    if len(parts) == 0:
                        continue
                    cam_id = int(parts[0])
                    fx = float(parts[4])
                    fy = float(parts[5])
                    cx = float(parts[6])
                    cy = float(parts[7])
                    cam_params[cam_id] = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])

            poses = []
            images = []
            intrinsics = []

            with open(poses_file, "r") as f:
                for line in f:
                    if line.startswith("#"):
                        continue
                    parts = line.strip().split()
                    if len(parts) == 0:
                        continue
                    # Lines whose first token contains "." are not image
                    # header lines (their first token is not an integer id).
                    if "." in parts[0]:
                        continue

                    img_name = parts[-1]
                    # Quaternion (w, x, y, z) to rotation matrix.
                    w, x, y, z = map(float, parts[1:5])
                    rot = np.array(
                        [
                            [
                                1 - 2 * y * y - 2 * z * z,
                                2 * x * y - 2 * z * w,
                                2 * x * z + 2 * y * w,
                            ],
                            [
                                2 * x * y + 2 * z * w,
                                1 - 2 * x * x - 2 * z * z,
                                2 * y * z - 2 * x * w,
                            ],
                            [
                                2 * x * z - 2 * y * w,
                                2 * y * z + 2 * x * w,
                                1 - 2 * x * x - 2 * y * y,
                            ],
                        ]
                    )
                    tx, ty, tz = map(float, parts[5:8])
                    cam_id = int(parts[-2])
                    pose = np.eye(4)
                    pose[:3, :3] = rot
                    pose[:3, 3] = [tx, ty, tz]
                    # The file stores [R | t]; its inverse is what gets saved.
                    poses.append(np.linalg.inv(pose))
                    images.append(img_name)
                    intrinsics.append(cam_params[cam_id])

            rgb_dir = osp.join(outdir, env, subseq, "rgb")
            cam_dir = osp.join(outdir, env, subseq, "cam")
            os.makedirs(osp.join(outdir, env, subseq), exist_ok=True)
            os.makedirs(rgb_dir, exist_ok=True)
            os.makedirs(cam_dir, exist_ok=True)

            for img_name, intrinsic, pose in zip(tqdm(images), intrinsics, poses):
                basename = img_name.split("/")[-1]
                cam_path = osp.join(cam_dir, basename.replace(".png", ".npz"))
                if os.path.exists(cam_path):
                    print("Exist!")
                    continue
                img_path = os.path.join(images_dir, img_name)
                # Only the image is required: the depth check referenced a
                # `depth_path` that is no longer defined and raised NameError.
                if not os.path.exists(img_path):
                    continue
                try:
                    rgb = Image.open(img_path)
                except Exception:
                    # Best effort: skip unreadable images, but let
                    # KeyboardInterrupt / SystemExit propagate.
                    continue

                # save all
                with rgb:
                    rgb.save(osp.join(rgb_dir, basename))
                np.savez(
                    cam_path,
                    intrinsic=intrinsic,
                    pose=pose,
                )


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(args.dl3dv_dir, args.output_dir)
extern/CUT3R/datasets_preprocess/preprocess_dynamic_replica.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess the Dynamic Replica dataset.
4
+
5
+ This script reads frame annotations (stored in compressed JSON files),
6
+ loads images, depth maps, optical flow, and camera parameters, and saves
7
+ processed images, depth maps, flow files, and camera metadata (intrinsics and poses)
8
+ to an output directory organized by split, sequence, and camera view.
9
+
10
+ Usage:
11
+ python preprocess_dynamic_replica.py --root_dir /path/to/data_dynamic_replica \
12
+ --out_dir /path/to/processed_dynamic_replica \
13
+ [--splits train valid test] \
14
+ [--num_processes 8]
15
+ """
16
+
17
+ import argparse
18
+ import gzip
19
+ import json
20
+ import os
21
+ import os.path as osp
22
+ import re
23
+ import shutil
24
+ import time
25
+ from collections import defaultdict
26
+ from dataclasses import dataclass
27
+ from multiprocessing import Pool, cpu_count
28
+ from typing import List, Optional
29
+
30
+ import cv2
31
+ import matplotlib.pyplot as plt
32
+ import numpy as np
33
+ import PIL.Image
34
+ import torch
35
+ from PIL import Image
36
+ from pytorch3d.implicitron.dataset.types import (
37
+ FrameAnnotation as ImplicitronFrameAnnotation,
38
+ load_dataclass,
39
+ )
40
+ from tqdm import tqdm
41
+ import imageio
42
+
43
+ # Enable OpenEXR support in OpenCV.
44
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
45
+
46
+ TAG_CHAR = np.array([202021.25], np.float32)
47
+
48
+
49
def readFlow(fn):
    """Read .flo file in Middlebury format.

    The file is read as a ``float32`` magic number (202021.25), two ``int32``
    values (width, then height) and ``2 * width * height`` ``float32`` values.

    Args:
        fn: Path of the ``.flo`` file.

    Returns:
        Array of shape ``(height, width, 2)``, or ``None`` (after printing a
        message) when the magic number or the header is missing or invalid.
    """
    with open(fn, "rb") as f:
        magic = np.fromfile(f, np.float32, count=1)
        # An empty file yields an empty array, whose comparison result has no
        # well-defined truth value; test the size explicitly.
        if magic.size != 1 or magic[0] != 202021.25:
            print("Magic number incorrect. Invalid .flo file")
            return None

        dims = np.fromfile(f, np.int32, count=2)
        if dims.size != 2:
            print("Truncated .flo header. Invalid .flo file")
            return None
        # Index the elements: int() on a 1-element array is deprecated.
        width, height = int(dims[0]), int(dims[1])

        data = np.fromfile(f, np.float32, count=2 * width * height)
        # NOTE: np.resize (kept for backward compatibility) repeats the data
        # when the payload is shorter than expected instead of raising.
        return np.resize(data, (height, width, 2))
61
+
62
+
63
def readPFM(file):
    """Read a PFM image file into a numpy array.

    Args:
        file: Path of the ``.pfm`` file.

    Returns:
        Array of shape ``(height, width, 3)`` for a ``PF`` header or
        ``(height, width)`` for a ``Pf`` header, flipped vertically.

    Raises:
        Exception: If the first line is neither ``PF`` nor ``Pf`` or the
            dimension line is malformed.
    """
    with open(file, "rb") as f:
        header = f.readline().rstrip()
        if header not in (b"PF", b"Pf"):
            raise Exception("Not a PFM file.")
        color = header == b"PF"

        dim_match = re.match(rb"^(\d+)\s(\d+)\s$", f.readline())
        if dim_match is None:
            raise Exception("Malformed PFM header.")
        width, height = (int(group) for group in dim_match.groups())

        # Only the sign of the scale line is used: negative selects
        # little-endian data, otherwise big-endian.
        scale = float(f.readline().rstrip())
        endian = "<" if scale < 0 else ">"

        data = np.fromfile(f, endian + "f")

    shape = (height, width, 3) if color else (height, width)
    return np.flipud(np.reshape(data, shape))
89
+
90
+
91
def read_gen(file_name, pil=False):
    """Load a file with a reader chosen from its (lower-cased) extension.

    Args:
        file_name: Path of the file to read.
        pil: Unused.

    Returns:
        A PIL image for ``.png/.jpeg/.ppm/.jpg``, the result of ``np.load``
        for ``.bin/.raw``, a ``float32`` array for ``.flo`` and ``.pfm`` (a
        3-D ``.pfm`` result has its last channel dropped), and an empty list
        for any other extension.
    """
    ext = osp.splitext(file_name)[-1].lower()

    if ext in (".png", ".jpeg", ".ppm", ".jpg"):
        return Image.open(file_name)
    if ext in (".bin", ".raw"):
        return np.load(file_name)
    if ext == ".flo":
        return readFlow(file_name).astype(np.float32)
    if ext == ".pfm":
        flow = readPFM(file_name).astype(np.float32)
        if len(flow.shape) == 2:
            return flow
        return flow[:, :, :-1]
    # Unknown extension: nothing to load.
    return []
103
+
104
+
105
def _load_16big_png_depth(depth_png):
    """Decode a depth map stored as float16 bit patterns in a 16-bit PNG.

    Args:
        depth_png: Path of the PNG file.

    Returns:
        ``float32`` array of shape ``(height, width)``.
    """
    with Image.open(depth_png) as depth_pil:
        width, height = depth_pil.size
        # Pixel values cast to uint16 are reinterpreted bit-for-bit as float16.
        raw_u16 = np.array(depth_pil, dtype=np.uint16)
        as_f16 = np.frombuffer(raw_u16, dtype=np.float16)
        depth = as_f16.astype(np.float32).reshape((height, width))
    return depth
113
+
114
+
115
@dataclass
class DynamicReplicaFrameAnnotation(ImplicitronFrameAnnotation):
    """A dataclass used to load annotations from .json for Dynamic Replica."""

    # All fields below extend the Implicitron frame annotation; each is
    # optional and defaults to None.

    # Camera the frame belongs to; frames are grouped by this value when a
    # split is processed.
    camera_name: Optional[str] = None
    # Presumably a path to the instance-id map -- confirm against the dataset.
    instance_id_map_path: Optional[str] = None
    # Presumably paths to the forward / backward optical flow and their
    # masks -- confirm against the dataset.
    flow_forward: Optional[str] = None
    flow_forward_mask: Optional[str] = None
    flow_backward: Optional[str] = None
    flow_backward_mask: Optional[str] = None
    # Presumably a path to the trajectory data -- confirm against the dataset.
    trajectories: Optional[str] = None
126
+
127
+
128
+ def _get_pytorch3d_camera(entry_viewpoint, image_size, scale: float):
129
+ """
130
+ Convert the camera parameters stored in an annotation to PyTorch3D convention.
131
+
132
+ Returns:
133
+ R, tvec, focal, principal_point
134
+ """
135
+ assert entry_viewpoint is not None
136
+ principal_point = torch.tensor(entry_viewpoint.principal_point, dtype=torch.float)
137
+ focal_length = torch.tensor(entry_viewpoint.focal_length, dtype=torch.float)
138
+ half_image_size_wh_orig = (
139
+ torch.tensor(list(reversed(image_size)), dtype=torch.float) / 2.0
140
+ )
141
+
142
+ fmt = entry_viewpoint.intrinsics_format
143
+ if fmt.lower() == "ndc_norm_image_bounds":
144
+ rescale = half_image_size_wh_orig
145
+ elif fmt.lower() == "ndc_isotropic":
146
+ rescale = half_image_size_wh_orig.min()
147
+ else:
148
+ raise ValueError(f"Unknown intrinsics format: {fmt}")
149
+
150
+ principal_point_px = half_image_size_wh_orig - principal_point * rescale
151
+ focal_length_px = focal_length * rescale
152
+
153
+ # Prepare rotation and translation for PyTorch3D
154
+ R = torch.tensor(entry_viewpoint.R, dtype=torch.float)
155
+ T = torch.tensor(entry_viewpoint.T, dtype=torch.float)
156
+ R_pytorch3d = R.clone()
157
+ T_pytorch3d = T.clone()
158
+ T_pytorch3d[..., :2] *= -1
159
+ R_pytorch3d[..., :, :2] *= -1
160
+ tvec = T_pytorch3d
161
+
162
+ return R, tvec, focal_length_px, principal_point_px
163
+
164
+
165
+ # Global configuration for splits and output.
166
+ SPLITS = ["train", "valid", "test"]
167
+ # (You can override the default root and out_dir via command-line arguments.)
168
+
169
+
170
def _annotation_path(entry):
    """Return ``entry["path"]`` if the annotation entry is set and non-empty, else None."""
    if not entry:
        return None
    return entry["path"] or None


def process_split_data(args):
    """
    Process all frames for a given split.

    Reads the gzip-compressed frame annotation file of the split, groups the
    frames per sequence and camera, and for each frame of the "left" and
    "right" cameras copies the RGB image, saves the decoded depth map, saves
    forward/backward optical flow with its mask when the annotation provides a
    path, and saves the camera intrinsics and inverted pose.

    Args:
        args: Tuple ``(split, root_dir, out_dir)``.

    Returns:
        None.

    Raises:
        AssertionError: If an image, depth, flow or flow-mask file referenced
            by an annotation does not exist.
    """
    split, root_dir, out_dir = args
    split_dir = osp.join(root_dir, split)
    # The frame annotations are stored in a compressed json file.
    frame_annotations_file = osp.join(split_dir, f"frame_annotations_{split}.jgz")
    with gzip.open(frame_annotations_file, "rt", encoding="utf8") as zipfile:
        frame_annots_list = load_dataclass(zipfile, List[DynamicReplicaFrameAnnotation])

    # Group frames by sequence and camera.
    seq_annot = defaultdict(lambda: defaultdict(list))
    for frame_annot in frame_annots_list:
        seq_annot[frame_annot.sequence_name][frame_annot.camera_name].append(
            frame_annot
        )

    # Process each sequence.
    for seq_name in tqdm(seq_annot.keys(), desc=f"Processing split '{split}'"):
        # For each camera (e.g., 'left', 'right'), create output directories.
        for cam in ["left", "right"]:
            cam_out_dir = osp.join(out_dir, split, seq_name, cam)
            out_img_dir = osp.join(cam_out_dir, "rgb")
            out_depth_dir = osp.join(cam_out_dir, "depth")
            out_fflow_dir = osp.join(cam_out_dir, "flow_forward")
            out_bflow_dir = osp.join(cam_out_dir, "flow_backward")
            out_cam_dir = osp.join(cam_out_dir, "cam")
            for directory in (
                out_img_dir,
                out_depth_dir,
                out_fflow_dir,
                out_bflow_dir,
                out_cam_dir,
            ):
                os.makedirs(directory, exist_ok=True)

            for framedata in tqdm(
                seq_annot[seq_name][cam], desc=f"Seq {seq_name} [{cam}]", leave=False
            ):
                timestamp = framedata.frame_timestamp
                im_path = osp.join(split_dir, framedata.image.path)
                depth_path = osp.join(split_dir, framedata.depth.path)

                # Collect the flow directions that this frame actually has.
                # A missing annotation (None) is treated like an empty path
                # instead of failing on the subscript.
                flow_jobs = []
                for flow_entry, mask_entry, flow_out_dir in (
                    (
                        framedata.flow_forward,
                        framedata.flow_forward_mask,
                        out_fflow_dir,
                    ),
                    (
                        framedata.flow_backward,
                        framedata.flow_backward_mask,
                        out_bflow_dir,
                    ),
                ):
                    rel_flow_path = _annotation_path(flow_entry)
                    if rel_flow_path is None:
                        continue
                    flow_jobs.append(
                        (
                            osp.join(split_dir, rel_flow_path),
                            osp.join(split_dir, mask_entry["path"]),
                            flow_out_dir,
                        )
                    )

                # Ensure required files exist before writing anything.
                assert os.path.isfile(im_path), im_path
                assert os.path.isfile(depth_path), depth_path
                for flow_path, mask_path, _ in flow_jobs:
                    assert os.path.isfile(flow_path), flow_path
                    assert os.path.isfile(mask_path), mask_path

                viewpoint = framedata.viewpoint
                # Load depth map.
                depth = _load_16big_png_depth(depth_path)

                # Save optical flow (and its mask) for each available direction.
                for flow_path, mask_path, flow_out_dir in flow_jobs:
                    np.savez(
                        osp.join(flow_out_dir, f"{timestamp}.npz"),
                        flow=cv2.imread(flow_path, cv2.IMREAD_UNCHANGED),
                        mask=cv2.imread(mask_path, cv2.IMREAD_UNCHANGED),
                    )

                # Get camera parameters.
                R, t, focal, pp = _get_pytorch3d_camera(
                    viewpoint, framedata.image.size, scale=1.0
                )
                intrinsics = np.eye(3)
                intrinsics[0, 0] = focal[0].item()
                intrinsics[1, 1] = focal[1].item()
                intrinsics[0, 2] = pp[0].item()
                intrinsics[1, 2] = pp[1].item()
                # Invert the rigid transform (R, t): rotation R^T, translation -R^T t.
                R_inv = R.numpy().T
                pose = np.eye(4)
                pose[:3, :3] = R_inv
                pose[:3, 3] = -R_inv @ t.numpy()

                # Define output file paths.
                out_img_path = osp.join(out_img_dir, f"{timestamp}.png")
                out_depth_path = osp.join(out_depth_dir, f"{timestamp}.npy")
                out_cam_path = osp.join(out_cam_dir, f"{timestamp}.npz")

                # Copy RGB image.
                shutil.copy(im_path, out_img_path)
                # Save depth.
                np.save(out_depth_path, depth)
                # Save camera metadata.
                np.savez(out_cam_path, intrinsics=intrinsics, pose=pose)
    return None
295
+
296
+
297
def main():
    """
    Command-line entry point for the Dynamic Replica preprocessing.

    Parses ``--root_dir``, ``--out_dir``, ``--splits`` and ``--num_processes``,
    creates the output directory, and hands one task per requested split to a
    multiprocessing pool running ``process_split_data``.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess Dynamic Replica dataset: convert raw annotations, images, "
        "depth, and flow files to a processed format."
    )
    # The two directory options share everything except the flag and help text.
    for flag, help_text in (
        ("--root_dir", "Root directory of the Dynamic Replica data."),
        ("--out_dir", "Output directory for processed data."),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    parser.add_argument(
        "--splits",
        type=str,
        nargs="+",
        default=SPLITS,
        help="List of splits to process (default: train valid test).",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=cpu_count(),
        help="Number of processes to use (default: number of CPU cores).",
    )
    cli = parser.parse_args()

    os.makedirs(cli.out_dir, exist_ok=True)
    tasks = []
    for split in cli.splits:
        tasks.append((split, cli.root_dir, cli.out_dir))

    print("Processing splits:", cli.splits)
    with Pool(processes=cli.num_processes) as pool:
        finished = pool.imap(process_split_data, tasks)
        # Drain the iterator so every split completes before the pool closes.
        for _ in tqdm(finished, total=len(tasks), desc="Overall Progress"):
            pass
341
+
342
+
343
+ if __name__ == "__main__":
344
+ main()
extern/CUT3R/datasets_preprocess/preprocess_eden.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess the Eden dataset.
4
+
5
+ This script processes the Eden dataset by copying RGB images, converting depth
6
+ data from .mat files to .npy format, and saving camera intrinsics from .mat files
7
+ into a structured output directory. Files are processed in parallel using
8
+ a ProcessPoolExecutor.
9
+
10
+ Usage:
11
+ python preprocess_eden.py --root /path/to/data_raw_videos/data_eden \
12
+ --out_dir /path/to/data_raw_videos/processed_eden \
13
+ [--num_workers N]
14
+ """
15
+
16
+ import os
17
+ import shutil
18
+ import scipy.io
19
+ import numpy as np
20
+ from tqdm import tqdm
21
+ from concurrent.futures import ProcessPoolExecutor, as_completed
22
+ import argparse
23
+
24
+
25
def process_basename(args):
    """
    Convert one Eden frame: copy its left RGB image, save its depth as .npy
    and its left-camera intrinsics as .npz.

    Parameters:
        args (tuple): (seq, basename, rgb_dir, depth_dir, cam_dir,
                       out_rgb_dir, out_depth_dir, out_cam_dir)
    Returns:
        None on success (or when the frame was already processed), otherwise
        a string describing what went wrong.
    """
    (
        seq,
        basename,
        rgb_dir,
        depth_dir,
        cam_dir,
        out_rgb_dir,
        out_depth_dir,
        out_cam_dir,
    ) = args

    out_cam_path = os.path.join(out_cam_dir, f"{basename}.npz")
    # The camera file is the last thing written, so its presence marks a
    # frame that has already been handled.
    if os.path.exists(out_cam_path):
        return None
    out_img_path = os.path.join(out_rgb_dir, f"{basename}.png")
    out_depth_path = os.path.join(out_depth_dir, f"{basename}.npy")

    try:
        cam_type = "L"
        img_file = os.path.join(rgb_dir, f"{basename}_{cam_type}.png")
        depth_file = os.path.join(depth_dir, f"{basename}_{cam_type}.mat")
        cam_file = os.path.join(cam_dir, f"{basename}.mat")

        # All three inputs must be present.
        inputs = (img_file, depth_file, cam_file)
        if not all(os.path.exists(path) for path in inputs):
            return f"Missing files for {basename} in {seq}"

        # Depth lives under the "Depth" key; keep only index 0 of the last axis.
        depth = scipy.io.loadmat(depth_file).get("Depth")
        if depth is None:
            return f"Depth data missing in {depth_file}"
        depth = depth[..., 0]

        # Intrinsics of the chosen camera live under "K_<cam_type>".
        intrinsics = scipy.io.loadmat(cam_file).get(f"K_{cam_type}")
        if intrinsics is None:
            return f"Intrinsics data missing in {cam_file}"

        shutil.copyfile(img_file, out_img_path)
        np.save(out_depth_path, depth)
        np.savez(out_cam_path, intrinsics=intrinsics)

    except Exception as e:
        # Report the failure to the caller instead of aborting the whole run.
        return f"Error processing {basename} in {seq}: {e}"

    return None
93
+
94
+
95
def main():
    """
    Command-line entry point for the Eden preprocessing.

    Finds every ``<scene>/<mode>`` directory under ``<root>/RGB``, creates the
    matching output directories, builds one task per camera ``.mat`` file and
    runs the tasks through a process pool, printing any error message a task
    returns.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess Eden dataset: copy RGB images, process depth maps, and save camera intrinsics."
    )
    parser.add_argument(
        "--root", type=str, default="", help="Root directory of the raw Eden data."
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default="",
        help="Output directory for processed Eden data.",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=os.cpu_count(),
        help="Number of worker processes to use.",
    )
    cli = parser.parse_args()

    # Modes typically found in the Eden dataset.
    modes = ["clear", "cloudy", "overcast", "sunset", "twilight"]

    rgb_root, depth_root, cam_root = (
        os.path.join(cli.root, sub) for sub in ("RGB", "Depth", "cam_matrix")
    )

    # Relative "<scene>/<mode>" paths for every mode directory that exists.
    seq_dirs = [
        os.path.join(scene, mode)
        for scene in os.listdir(rgb_root)
        for mode in modes
        if os.path.isdir(os.path.join(rgb_root, scene, mode))
    ]

    all_tasks = []
    for seq in seq_dirs:
        # Flatten the relative path into a single directory name.
        seq_name = seq.replace(os.sep, "_")
        out_dirs = [
            os.path.join(cli.out_dir, seq_name, kind)
            for kind in ("rgb", "depth", "cam")
        ]
        for directory in out_dirs:
            os.makedirs(directory, exist_ok=True)

        # Frame basenames are taken from the camera directory's .mat files.
        cam_dir = os.path.join(cam_root, seq)
        stems = sorted(
            name[:-4] for name in os.listdir(cam_dir) if name.endswith(".mat")
        )
        rgb_dir = os.path.join(rgb_root, seq)
        depth_dir = os.path.join(depth_root, seq)
        all_tasks.extend(
            (seq, stem, rgb_dir, depth_dir, cam_dir, *out_dirs) for stem in stems
        )

    num_workers = cli.num_workers
    print(f"Processing {len(all_tasks)} tasks using {num_workers} workers...")
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        pending = [executor.submit(process_basename, task) for task in all_tasks]
        progress = tqdm(
            as_completed(pending), total=len(pending), desc="Processing tasks"
        )
        for done in progress:
            message = done.result()
            if message:
                print(message)
178
+
179
+
180
+ if __name__ == "__main__":
181
+ main()
extern/CUT3R/datasets_preprocess/preprocess_hoi4d.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ HOI4D Preprocessing Script
4
+
5
+ This script processes HOI4D data by:
6
+ 1. Searching specific subdirectories for RGB and depth images.
7
+ 2. Reading camera intrinsics from a .npy file (one per high-level scene).
8
+ 3. Rescaling the RGB images and depth maps to a fixed output resolution
9
+ (e.g., 640x480) using the 'cropping' module.
10
+ 4. Saving results (RGB, .npy depth, .npz camera intrinsics) in a new directory structure.
11
+
12
+ Usage:
13
+ python preprocess_hoi4d.py \
14
+ --root_dir /path/to/HOI4D_release \
15
+ --cam_root /path/to/camera_params \
16
+ --out_dir /path/to/processed_hoi4d
17
+ """
18
+
19
+ import os
20
+ import glob
21
+ import cv2
22
+ import numpy as np
23
+ from PIL import Image
24
+ from tqdm import tqdm
25
+ from concurrent.futures import ProcessPoolExecutor
26
+ import argparse
27
+
28
+ import src.dust3r.datasets.utils.cropping as cropping
29
+
30
def parse_arguments():
    """
    Build the HOI4D command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace: ``root_dir``, ``cam_root`` and ``out_dir``
        (all required) plus ``max_workers`` (int, ``None`` when omitted).
    """
    parser = argparse.ArgumentParser(
        description="Preprocess HOI4D dataset by rescaling RGB and depth images."
    )
    # The three path options are all mandatory and differ only in their text.
    required_paths = (
        ("--root_dir", "Path to the HOI4D_release directory."),
        ("--cam_root", "Path to the directory containing camera intrinsics."),
        ("--out_dir", "Path to the directory where processed files will be saved."),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument(
        "--max_workers",
        type=int,
        default=None,
        help="Number of parallel workers. Default uses half of available CPU cores.",
    )
    return parser.parse_args()
50
+
51
def process_image(args):
    """
    Process a single image and depth map:
      - Loads the image (using PIL) and depth (using OpenCV).
      - Converts depth from mm to meters (divided by 1000).
      - Rescales both using 'cropping.rescale_image_depthmap' to (640, 480).
      - Saves the rescaled image (.png), depth (.npy), and camera intrinsics (.npz).

    Args:
        args (tuple): A tuple of:
            (img_path, depth_path, out_img_path, out_depth_path, out_cam_path, intrinsics)

    Returns:
        None. Errors are printed to the console but do not stop the workflow.
    """
    img_path, depth_path, out_img_path, out_depth_path, out_cam_path, intrinsics = args

    try:
        # Load depth (in mm) and convert to meters.
        depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
        if depth is None:
            raise ValueError(f"Could not read depth image: {depth_path}")
        depth = depth.astype(np.float32) / 1000.0

        # Open the image in a context manager so its file handle is always
        # released. Everything that touches the image stays inside the block,
        # because the rescale helper may hand back the very same image object.
        with Image.open(img_path) as img:
            # A copy of the intrinsics is passed so the shared array is not modified.
            img_rescaled, depth_rescaled, intrinsics_rescaled = (
                cropping.rescale_image_depthmap(
                    img, depth, intrinsics.copy(), (640, 480)
                )
            )

            # Save processed data.
            img_rescaled.save(out_img_path)  # PNG image
            np.save(out_depth_path, depth_rescaled)  # Depth .npy
            np.savez(out_cam_path, intrinsics=intrinsics_rescaled)

    except Exception as e:
        # Deliberate best effort: report the failure and let other frames proceed.
        print(f"Error processing {img_path}: {e}")
90
+
91
def main():
    """
    Command-line entry point for the HOI4D preprocessing.

    Collects every scene directory matching
    ``root/ZY2021*/H*/C*/N*/S*/s*/T*``, pairs each ``align_rgb/*.jpg`` with the
    same-named ``align_depth/*.png``, skips frames whose three outputs already
    exist, and processes the remaining frames in a process pool.
    """
    args = parse_arguments()

    root = args.root_dir
    cam_root = args.cam_root
    out_dir = args.out_dir
    os.makedirs(out_dir, exist_ok=True)

    # Collect a list of subdirectories using a glob pattern
    # e.g.: root/ZY2021*/H*/C*/N*/S*/s*/T*
    scene_dirs = glob.glob(
        os.path.join(root, "ZY2021*", "H*", "C*", "N*", "S*", "s*", "T*")
    )

    # Intrinsics are stored once per top-level directory, so load each file
    # only once instead of once per scene. None marks a missing file.
    intrinsics_cache = {}

    # Build tasks
    tasks = []
    for scene_dir in tqdm(scene_dirs, desc="Collecting scenes"):
        # Build an output sub-directory name
        # Example: "ZY202101/H1/C1/N1/S1/s1/T1" -> "ZY202101_H1_C1_N1_S1_s1_T1"
        scene_parts = os.path.relpath(scene_dir, root).split(os.sep)
        scene_name = "_".join(scene_parts)

        # The first path component selects the intrinsics file:
        # "ZY202101" -> "cam_root/ZY202101/intrin.npy"
        top_level = scene_parts[0]
        cam_file = os.path.join(cam_root, top_level, "intrin.npy")
        if top_level not in intrinsics_cache:
            intrinsics_cache[top_level] = (
                np.load(cam_file) if os.path.isfile(cam_file) else None
            )
        intrinsics = intrinsics_cache[top_level]
        if intrinsics is None:
            print(f"Warning: Camera file not found: {cam_file}. Skipping {scene_dir}")
            continue

        # Directories for this sequence
        rgb_dir = os.path.join(scene_dir, "align_rgb")
        depth_dir = os.path.join(scene_dir, "align_depth")

        # Output directories
        out_rgb_dir = os.path.join(out_dir, scene_name, "rgb")
        out_depth_dir = os.path.join(out_dir, scene_name, "depth")
        out_cam_dir = os.path.join(out_dir, scene_name, "cam")
        for directory in (out_rgb_dir, out_depth_dir, out_cam_dir):
            os.makedirs(directory, exist_ok=True)

        # Build tasks for each image
        for img_path in sorted(glob.glob(os.path.join(rgb_dir, "*.jpg"))):
            basename = os.path.splitext(os.path.basename(img_path))[0]
            depth_path = os.path.join(depth_dir, f"{basename}.png")

            out_img_path = os.path.join(out_rgb_dir, f"{basename}.png")
            out_depth_path = os.path.join(out_depth_dir, f"{basename}.npy")
            out_cam_path = os.path.join(out_cam_dir, f"{basename}.npz")

            # Skip if already processed
            outputs = (out_img_path, out_depth_path, out_cam_path)
            if all(os.path.exists(path) for path in outputs):
                continue

            tasks.append(
                (
                    img_path,
                    depth_path,
                    out_img_path,
                    out_depth_path,
                    out_cam_path,
                    intrinsics,
                )
            )

    # Process tasks in parallel
    max_workers = args.max_workers
    if max_workers is None:
        # os.cpu_count() may return None when the count cannot be determined.
        max_workers = max(1, (os.cpu_count() or 1) // 2)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        list(
            tqdm(
                executor.map(process_image, tasks),
                total=len(tasks),
                desc="Processing images",
            )
        )
172
+
173
+
174
+ if __name__ == "__main__":
175
+ main()
extern/CUT3R/datasets_preprocess/preprocess_hypersim.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess the Hypersim dataset.
4
+
5
+ This script reads camera parameters from a CSV file, converts an OpenGL-style
6
+ projection matrix into a camera intrinsic matrix, applies tone mapping, and
7
+ saves processed RGB images, depth maps, and camera metadata into an output
8
+ directory. Processing is done per scene and per camera view.
9
+
10
+ Usage:
11
+ python preprocess_hypersim.py --hypersim_dir /path/to/hypersim \
12
+ --output_dir /path/to/processed_hypersim
13
+ """
14
+
15
+ import argparse
16
+ import os
17
+ import shutil
18
+ import time
19
+
20
+ import cv2
21
+ import h5py
22
+ import matplotlib.pyplot as plt
23
+ import numpy as np
24
+ import pandas as pd
25
+ from PIL import Image
26
+ from tqdm import tqdm
27
+
28
+ # Ensure OpenEXR support for OpenCV.
29
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
30
+
31
+
32
def get_parser():
    """
    Build the command-line parser for the Hypersim preprocessing script.

    Returns:
        argparse.ArgumentParser with ``--hypersim_dir`` and ``--output_dir``,
        both defaulting to placeholder paths.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess the Hypersim dataset by converting projection "
        "matrices, applying tone mapping, and saving processed outputs."
    )
    options = (
        (
            "--hypersim_dir",
            "/path/to/hypersim",
            "Root directory of the Hypersim dataset.",
        ),
        (
            "--output_dir",
            "/path/to/processed_hypersim",
            "Output directory for processed Hypersim data.",
        ),
    )
    for flag, default_path, help_text in options:
        parser.add_argument(flag, default=default_path, help=help_text)
    return parser
48
+
49
+
50
def opengl_to_intrinsics(proj_matrix, width_pixels, height_pixels):
    """
    Derive a 3x3 pixel-space intrinsic matrix from a projection matrix.

    Only the entries [0, 0], [0, 1], [0, 2], [1, 1] and [1, 2] of
    ``proj_matrix`` are read.

    Args:
        proj_matrix: Indexable 2-D matrix (at least 2x3).
        width_pixels: Image width in pixels.
        height_pixels: Image height in pixels.

    Returns:
        np.ndarray of shape (3, 3): ``[[fx, skew, cx], [0, fy, cy], [0, 0, 1]]``.
    """
    # Horizontal terms scale with half the width; the skew term and the
    # x-offset change sign relative to the projection matrix.
    fx = proj_matrix[0, 0] * width_pixels / 2.0
    skew = -proj_matrix[0, 1] * width_pixels / 2.0
    cx = (1.0 - proj_matrix[0, 2]) * width_pixels / 2.0
    # Vertical terms scale with half the height.
    fy = proj_matrix[1, 1] * height_pixels / 2.0
    cy = (1.0 + proj_matrix[1, 2]) * height_pixels / 2.0

    rows = [
        [fx, skew, cx],
        [0.0, fy, cy],
        [0.0, 0.0, 1.0],
    ]
    return np.array(rows)
58
+
59
+
60
def _image_plane_norm(width_pixels, height_pixels, focal):
    """Return, per pixel, the norm of the vector (x, y, focal) to that pixel's centre."""
    # x / y offsets of every pixel centre, measured from the image centre.
    plane_x = (
        np.linspace(
            (-0.5 * width_pixels) + 0.5,
            (0.5 * width_pixels) - 0.5,
            width_pixels,
        )
        .reshape(1, width_pixels)
        .repeat(height_pixels, 0)
        .astype(np.float32)[:, :, None]
    )
    plane_y = (
        np.linspace(
            (-0.5 * height_pixels) + 0.5,
            (0.5 * height_pixels) - 0.5,
            height_pixels,
        )
        .reshape(height_pixels, 1)
        .repeat(width_pixels, 1)
        .astype(np.float32)[:, :, None]
    )
    plane_z = np.full([height_pixels, width_pixels, 1], focal, np.float32)
    image_plane = np.concatenate([plane_x, plane_y, plane_z], axis=2)
    return np.linalg.norm(image_plane, axis=2)


def process_scene(args):
    """
    Convert one Hypersim scene into per-frame RGB, depth and camera files.

    For every camera listed in the scene's ``metadata_cameras.csv`` and every
    ``*.color.hdf5`` frame, this writes ``<frame>_rgb.png`` (tone-mapped
    colour), ``<frame>_depth.npy`` (float32) and ``<frame>_cam.npz``
    (``intrinsics`` and a 4x4 ``pose``) under ``outdir/scene_name/<camera_id>``.

    Args:
        args: Tuple ``(rootdir, outdir, scene_name)``.

    Returns:
        None. The scene is skipped (after printing a message) when the derived
        intrinsics have a non-zero skew term.

    Raises:
        AssertionError: If camera position/orientation counts differ, colour
            and depth frame lists differ in length or frame id, or a
            render-entity id equals 0.
    """
    rootdir, outdir, scene_name = args
    scene_outdir = os.path.join(outdir, scene_name)
    os.makedirs(scene_outdir, exist_ok=True)
    seq_dir = os.path.join(rootdir, scene_name)
    seq_detail_dir = os.path.join(seq_dir, "_detail")
    seq_images_dir = os.path.join(seq_dir, "images")

    # Read global camera parameters from the CSV file.
    all_metafile = os.path.join(rootdir, "metadata_camera_parameters.csv")
    df_camera_parameters = pd.read_csv(all_metafile, index_col="scene_name")
    df_ = df_camera_parameters.loc[scene_name]

    width_pixels = int(df_["settings_output_img_width"])
    height_pixels = int(df_["settings_output_img_height"])

    # Assemble the 4x4 projection matrix from the M_proj_<row><col> columns.
    M_proj = np.array(
        [[df_[f"M_proj_{row}{col}"] for col in range(4)] for row in range(4)]
    )

    camera_intrinsics = opengl_to_intrinsics(
        M_proj, width_pixels, height_pixels
    ).astype(np.float32)
    if camera_intrinsics[0, 1] != 0:
        print(f"camera_intrinsics[0, 1] != 0: {camera_intrinsics[0, 1]}")
        return

    # Read world scale and camera IDs.
    worldscale = (
        pd.read_csv(
            os.path.join(seq_detail_dir, "metadata_scene.csv"),
            index_col="parameter_name",
        )
        .to_numpy()
        .flatten()[0]
        .astype(np.float32)
    )
    camera_ids = (
        pd.read_csv(
            os.path.join(seq_detail_dir, "metadata_cameras.csv"),
            header=None,
            skiprows=1,
        )
        .to_numpy()
        .flatten()
    )

    # Tone mapping parameters.
    gamma = 1.0 / 2.2  # Standard gamma correction exponent.
    inv_gamma = 1.0 / gamma
    percentile = 90  # Desired percentile brightness in the unmodified image.
    brightness_nth_percentile_desired = 0.8  # Desired brightness after scaling.

    # The intrinsics and image size are fixed for the whole scene, so the
    # per-pixel norm used to convert distances to depth is computed only once.
    focal = (camera_intrinsics[0, 0] + camera_intrinsics[1, 1]) / 2.0
    plane_norm = _image_plane_norm(width_pixels, height_pixels, focal)

    # Applied on the right of each stored orientation: negates its y and z columns.
    axis_flip = np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]])

    for camera_id in camera_ids:
        subscene_dir = os.path.join(scene_outdir, f"{camera_id}")
        os.makedirs(subscene_dir, exist_ok=True)
        camera_dir = os.path.join(seq_detail_dir, camera_id)
        if not os.path.exists(camera_dir):
            print(f"{camera_dir} does not exist.")
            continue
        color_dir = os.path.join(seq_images_dir, f"scene_{camera_id}_final_hdf5")
        geometry_dir = os.path.join(seq_images_dir, f"scene_{camera_id}_geometry_hdf5")
        if not (os.path.exists(color_dir) and os.path.exists(geometry_dir)):
            print(f"{color_dir} or {geometry_dir} does not exist.")
            continue

        camera_positions_hdf5_file = os.path.join(
            camera_dir, "camera_keyframe_positions.hdf5"
        )
        camera_orientations_hdf5_file = os.path.join(
            camera_dir, "camera_keyframe_orientations.hdf5"
        )

        with h5py.File(camera_positions_hdf5_file, "r") as f:
            camera_positions = f["dataset"][:]
        with h5py.File(camera_orientations_hdf5_file, "r") as f:
            camera_orientations = f["dataset"][:]

        assert len(camera_positions) == len(
            camera_orientations
        ), f"len(camera_positions)={len(camera_positions)} != len(camera_orientations)={len(camera_orientations)}"

        rgbs = sorted([f for f in os.listdir(color_dir) if f.endswith(".color.hdf5")])
        depths = sorted(
            [f for f in os.listdir(geometry_dir) if f.endswith(".depth_meters.hdf5")]
        )
        assert len(rgbs) == len(
            depths
        ), f"len(rgbs)={len(rgbs)} != len(depths)={len(depths)}"
        # The frame id is the second dot-separated field of the file name;
        # keep only the poses of frames that actually have an image.
        exist_frame_ids = [int(f.split(".")[1]) for f in rgbs]
        valid_camera_positions = camera_positions[exist_frame_ids]
        valid_camera_orientations = camera_orientations[exist_frame_ids]

        for i, (rgb_name, depth_name) in enumerate(
            tqdm(zip(rgbs, depths), total=len(rgbs))
        ):
            frame_id = int(rgb_name.split(".")[1])
            assert frame_id == int(
                depth_name.split(".")[1]
            ), f"frame_id={frame_id} != {int(depth_name.split('.')[1])}"
            render_entity = os.path.join(
                geometry_dir,
                depth_name.replace("depth_meters.hdf5", "render_entity_id.hdf5"),
            )
            with h5py.File(os.path.join(color_dir, rgb_name), "r") as f:
                color = f["dataset"][:]
            with h5py.File(os.path.join(geometry_dir, depth_name), "r") as f:
                distance = f["dataset"][:]

            # Camera-to-world pose: flipped orientation plus scaled position.
            T_cam2world = np.eye(4)
            T_cam2world[:3, :3] = valid_camera_orientations[i] @ axis_flip
            T_cam2world[:3, 3] = valid_camera_positions[i] * worldscale

            if not np.isfinite(T_cam2world).all():
                print(f"frame_id={frame_id} T_cam2world is not finite.")
                continue

            # Scale each stored distance by focal / |(x, y, focal)| for its pixel.
            depth_map = distance / plane_norm * focal

            with h5py.File(render_entity, "r") as f:
                render_entity_id = f["dataset"][:].astype(np.int32)
            assert (render_entity_id != 0).all()
            # Pixels whose entity id is -1 are excluded from the brightness estimate.
            valid_mask = render_entity_id != -1

            # Tone mapping: pick a scale so the chosen brightness percentile of
            # the valid pixels lands on the desired value after gamma.
            if np.sum(valid_mask) == 0:
                scale = 1.0  # If there are no valid pixels, set scale to 1.0.
            else:
                brightness = (
                    0.3 * color[:, :, 0] + 0.59 * color[:, :, 1] + 0.11 * color[:, :, 2]
                )
                brightness_valid = brightness[valid_mask]
                eps = 0.0001  # Avoid division by zero.
                brightness_nth_percentile_current = np.percentile(
                    brightness_valid, percentile
                )
                if brightness_nth_percentile_current < eps:
                    scale = 0.0
                else:
                    scale = (
                        np.power(brightness_nth_percentile_desired, inv_gamma)
                        / brightness_nth_percentile_current
                    )

            color = np.power(np.maximum(scale * color, 0), gamma)
            color = np.clip(color, 0.0, 1.0)

            out_rgb_path = os.path.join(subscene_dir, f"{frame_id:06d}_rgb.png")
            Image.fromarray((color * 255).astype(np.uint8)).save(out_rgb_path)
            out_depth_path = os.path.join(subscene_dir, f"{frame_id:06d}_depth.npy")
            np.save(out_depth_path, depth_map.astype(np.float32))
            out_cam_path = os.path.join(subscene_dir, f"{frame_id:06d}_cam.npz")
            np.savez(
                out_cam_path,
                intrinsics=camera_intrinsics,
                pose=T_cam2world.astype(np.float32),
            )
246
+
247
+
248
def main():
    """
    Command-line entry point for the Hypersim preprocessing.

    Parses the arguments, creates the output directory and runs
    ``process_scene`` sequentially on every sub-directory of the dataset
    root, in sorted name order.
    """
    args = get_parser().parse_args()

    rootdir = args.hypersim_dir  # e.g., '/path/to/hypersim'
    outdir = args.output_dir  # e.g., '/path/to/processed_hypersim'
    os.makedirs(outdir, exist_ok=True)

    # Every directory directly under the root is treated as a scene.
    scenes = sorted(
        f for f in os.listdir(rootdir) if os.path.isdir(os.path.join(rootdir, f))
    )
    # Scenes are processed one after another; no worker pool is used here.
    for scene in scenes:
        process_scene((rootdir, outdir, scene))
265
+
266
+
267
+ if __name__ == "__main__":
268
+ main()
extern/CUT3R/datasets_preprocess/preprocess_irs.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess the IRS dataset.
4
+
5
+ This script converts disparity EXR files into depth maps, copies corresponding RGB images,
6
+ and saves camera intrinsics computed from a given focal length and baseline. Processing is
7
+ done per sequence directory using parallel processing.
8
+
9
+ Usage:
10
+ python preprocess_irs.py \
11
+ --root_dir /path/to/data_irs \
12
+ --out_dir /path/to/processed_irs
13
+ """
14
+
15
+ import os
16
+ import shutil
17
+ import re
18
+ import glob
19
+ import time
20
+ from concurrent.futures import ProcessPoolExecutor, as_completed
21
+
22
+ import numpy as np
23
+ import OpenEXR
24
+ import Imath
25
+ import imageio
26
+ from PIL import Image
27
+ from tqdm import tqdm
28
+ import argparse
29
+
30
+ # Ensure OpenEXR support in OpenCV if needed.
31
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
32
+
33
+
34
def exr2hdr(exrpath):
    """
    Read an OpenEXR file and return an HDR image as a NumPy array.

    When the header lists more than one channel, the "R", "G" and "B" channels
    are read; otherwise the single channel is read under the name "G".

    Parameters:
        exrpath: path of the EXR file to read.
    Returns:
        float32 array of shape (height, width, 3) or (height, width, 1).
    """
    file = OpenEXR.InputFile(exrpath)
    header = file.header()
    pix_type = Imath.PixelType(Imath.PixelType.FLOAT)

    channels = ["R", "G", "B"] if len(header["channels"].keys()) > 1 else ["G"]

    # The data window bounds are inclusive, hence the +1.
    dw = header["dataWindow"]
    width = dw.max.x - dw.min.x + 1
    height = dw.max.y - dw.min.y + 1

    # np.frombuffer replaces the deprecated binary mode of np.fromstring; it
    # returns a read-only view, and np.stack below copies into a fresh array.
    planes = [
        np.frombuffer(file.channel(c, pix_type), dtype=np.float32).reshape(
            height, width
        )
        for c in channels
    ]
    return np.stack(planes, axis=-1)
60
+
61
+
62
def writehdr(hdrpath, hdr):
    """
    Save an image to ``hdrpath`` through imageio using the "hdr" format.

    A single-channel input (shape (H, W, 1)) is expanded to three identical
    channels before writing; any other channel count is written unchanged.
    """
    _, _, n_channels = hdr.shape
    if n_channels == 1:
        # Replicate the lone channel into R, G and B.
        hdr = np.repeat(hdr, 3, axis=2)
    imageio.imwrite(hdrpath, hdr, format="hdr")
73
+
74
+
75
def load_exr(filename):
    """
    Load an EXR file and return the HDR image as a NumPy array.

    Parameters:
        filename: path of the EXR file.
    Returns:
        The array from ``exr2hdr``; a single-channel image is returned as a
        2-D (height, width) array, a multi-channel image keeps its channel axis.
    """
    hdr = exr2hdr(filename)
    if hdr.shape[2] == 1:
        # Drop only the channel axis. np.squeeze would also remove a height or
        # width of 1 and hand callers a 1-D array.
        hdr = hdr[:, :, 0]
    return hdr
84
+
85
+
86
def process_basename(args):
    """
    Process a single basename:
      - Load the RGB image size and the disparity (EXR) file.
      - Compute a depth map from disparity using: depth = (baseline * f) / disparity.
      - Copy the RGB image and save the computed depth and camera intrinsics.

    Work is skipped when the camera file for this basename already exists.

    Parameters:
        args: tuple containing
              (basename, seq_dir, out_rgb_dir, out_depth_dir, out_cam_dir, f, baseline)
    Returns:
        None on success (or when skipped), or an error string on failure.
    """
    basename, seq_dir, out_rgb_dir, out_depth_dir, out_cam_dir, f, baseline = args
    out_img_path = os.path.join(out_rgb_dir, f"{basename}.png")
    out_depth_path = os.path.join(out_depth_dir, f"{basename}.npy")
    out_cam_path = os.path.join(out_cam_dir, f"{basename}.npz")

    # The camera file is written last, so its presence marks a finished item.
    if os.path.exists(out_cam_path):
        return None

    try:
        img_file = os.path.join(seq_dir, f"l_{basename}.png")
        disp_file = os.path.join(seq_dir, f"d_{basename}.exr")

        # Only the image size is needed; the context manager closes the file
        # handle instead of leaving it open for the lifetime of the worker.
        with Image.open(img_file) as img:
            img_size = img.size

        # Load disparity using the custom load_exr function.
        disp = load_exr(disp_file).astype(np.float32)
        H, W = disp.shape

        # PIL reports size as (width, height).
        if img_size != (W, H):
            return f"Size mismatch for {basename}: Image size {img_size}, Disparity size {(W, H)}"

        # Simple pinhole intrinsics with the principal point at the image centre.
        K = np.eye(3, dtype=np.float32)
        K[0, 0] = f
        K[1, 1] = f
        K[0, 2] = W // 2
        K[1, 2] = H // 2

        # NOTE: pixels with zero disparity become inf in the depth map.
        depth = baseline * f / disp

        shutil.copyfile(img_file, out_img_path)
        np.save(out_depth_path, depth)
        np.savez(out_cam_path, intrinsics=K)

    except Exception as e:
        # Report the failure to the parent process instead of killing the worker.
        return f"Error processing {basename}: {e}"

    return None
142
+
143
+
144
def main():
    """
    Entry point: preprocess every IRS sequence found under ``--root_dir``.

    Sequence directories are the sub-directories of the root, except that
    "Store" contributes its sub-directories and "IRS_small" contributes its
    sub-sub-directories. For each sequence the rgb/depth/cam output folders are
    created and every ``*.exr`` disparity file is processed in a process pool.
    Errors reported by workers are printed.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess IRS dataset: convert EXR disparity to depth, "
        "copy RGB images, and save camera intrinsics."
    )
    parser.add_argument(
        "--root_dir",
        type=str,
        default="/path/to/data_raw_videos/data_irs",
        help="Root directory of the raw IRS data.",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default="/path/to/data_raw_videos/processed_irs",
        help="Output directory for processed IRS data.",
    )
    args = parser.parse_args()

    # Example parameters (adjust as needed)
    baseline = 0.1
    f = 480

    root = args.root_dir
    out_dir = args.out_dir

    def subdirs(*parts):
        """List the names of the directories directly under root/parts."""
        base = os.path.join(root, *parts)
        return [n for n in os.listdir(base) if os.path.isdir(os.path.join(base, n))]

    # Gather sequence directories (paths relative to the root).
    seq_dirs = []
    for d in subdirs():
        if d == "Store":
            seq_dirs.extend(os.path.join(d, sub) for sub in subdirs(d))
        elif d == "IRS_small":
            for sub in subdirs(d):
                seq_dirs.extend(
                    os.path.join(d, sub, subsub) for subsub in subdirs(d, sub)
                )
        else:
            seq_dirs.append(d)
    seq_dirs.sort()

    # os.cpu_count() may return None, and halving a count of 1 gives 0;
    # ProcessPoolExecutor requires at least one worker.
    num_workers = max(1, (os.cpu_count() or 1) // 2)

    # Process each sequence.
    for seq in seq_dirs:
        seq_dir = os.path.join(root, seq)
        out_rgb_dir = os.path.join(out_dir, seq, "rgb")
        out_depth_dir = os.path.join(out_dir, seq, "depth")
        out_cam_dir = os.path.join(out_dir, seq, "cam")

        os.makedirs(out_rgb_dir, exist_ok=True)
        os.makedirs(out_depth_dir, exist_ok=True)
        os.makedirs(out_cam_dir, exist_ok=True)

        # Disparity files are named "d_<basename>.exr": strip the two-character
        # prefix and the four-character extension.
        basenames = sorted(d[2:-4] for d in os.listdir(seq_dir) if d.endswith(".exr"))

        tasks = [
            (basename, seq_dir, out_rgb_dir, out_depth_dir, out_cam_dir, f, baseline)
            for basename in basenames
        ]

        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = {
                executor.submit(process_basename, task): task[0] for task in tasks
            }
            for future in tqdm(
                as_completed(futures), total=len(futures), desc=f"Processing {seq}"
            ):
                error = future.result()
                if error:
                    print(error)
227
+
228
+
229
+ if __name__ == "__main__":
230
+ main()
extern/CUT3R/datasets_preprocess/preprocess_mapfree.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import os
3
+ import argparse
4
+ import glob
5
+
6
+
7
def get_parser():
    """Build the command-line parser: three optional path arguments with defaults."""
    option_defaults = (
        ("--mapfree_dir", "mapfree/train/"),
        ("--colmap_dir", "mapfree/colmap"),
        ("--output_dir", "processed_mapfree"),
    )
    cli = argparse.ArgumentParser()
    for flag, default_value in option_defaults:
        cli.add_argument(flag, default=default_value)
    return cli
22
+
23
+
24
def run_patch_match_stereo(root_colmap_dir, root_img_dir):
    """
    Prepare a dense COLMAP workspace for every scene sub-sequence.

    For each scene under ``root_colmap_dir`` and each of "seq0"/"seq1", this
    runs ``colmap image_undistorter`` into ``<scene>/dense<i>`` and then
    replaces the workspace's ``sparse`` folder and ``images/<seq>`` folder with
    copies of the original model and images. A sub-sequence is skipped when its
    model directory is missing, or when the number of ``*geometric.bin`` depth
    maps already equals the number of ``*.jpg`` images.

    Parameters:
        root_colmap_dir: directory containing one COLMAP folder per scene.
        root_img_dir: directory containing one image folder per scene.
    """
    import shutil  # local import: only this function needs it

    sub_dir_names = ["seq0", "seq1"]
    for scene_name in sorted(os.listdir(root_colmap_dir)):
        scene_dir = os.path.join(root_colmap_dir, scene_name)
        img_dir = os.path.join(root_img_dir, scene_name)
        for i, sub in enumerate(sub_dir_names):
            sub_dir = os.path.join(scene_dir, sub)
            out_dir = os.path.join(scene_dir, f"dense{i}")
            if not os.path.exists(sub_dir):
                continue

            depth_dir = os.path.join(out_dir, "stereo", "depth_maps", sub)
            if os.path.exists(depth_dir):
                n_depth = len(glob.glob(os.path.join(depth_dir, "*geometric.bin")))
                n_images = len(glob.glob(os.path.join(img_dir, sub, "*.jpg")))
                if n_depth == n_images:
                    print(f"depth maps already computed, skip {sub_dir}")
                    continue

            print(sub_dir)
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            status = subprocess.call(
                [
                    "colmap",
                    "image_undistorter",
                    "--image_path",
                    img_dir,
                    "--input_path",
                    sub_dir,
                    "--output_path",
                    out_dir,
                    "--output_type",
                    "COLMAP",
                ]
            )
            if status != 0:
                # Without an undistorted workspace there is nothing to fix up.
                print(f"colmap image_undistorter failed ({status}) for {sub_dir}")
                continue

            # Swap the undistorter's outputs for the original model and images.
            images_out = os.path.join(out_dir, "images")
            sparse_out = os.path.join(out_dir, "sparse")
            shutil.rmtree(os.path.join(images_out, sub), ignore_errors=True)
            shutil.rmtree(sparse_out, ignore_errors=True)
            try:
                shutil.copytree(sub_dir, sparse_out)
                shutil.copytree(
                    os.path.join(img_dir, sub), os.path.join(images_out, sub)
                )
            except OSError as err:
                # Best effort, like the original shell commands: report and go on.
                print(f"failed to assemble workspace {out_dir}: {err}")

            # The MVS step is disabled because the MVS results have been
            # released; to re-run it, call:
            #   colmap patch_match_stereo --workspace_path <out_dir>
            #       --workspace_format COLMAP
            #       --PatchMatchStereo.cache_size 512
            #       --PatchMatchStereo.geom_consistency true
67
+
68
+
69
+ if __name__ == "__main__":
70
+ parser = get_parser()
71
+ args = parser.parse_args()
72
+ root_colmap_dir = args.colmap_dir
73
+ root_img_dir = args.mapfree_dir
74
+
75
+ # run patch match stereo
76
+ run_patch_match_stereo(root_colmap_dir, root_img_dir)
extern/CUT3R/datasets_preprocess/preprocess_mapfree2.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+
4
+ import os.path as osp
5
+
6
+ from PIL import Image
7
+ import numpy as np
8
+
9
+
10
+ from tqdm import tqdm
11
+ from read_write_model import run
12
+
13
+
14
def get_parser():
    """Build the command-line parser with the input and output directory options."""
    import argparse

    cli = argparse.ArgumentParser()
    # TODO: both defaults are placeholders.
    for flag, default_value in (
        ("--mapfree_dir", ""),
        ("--output_dir", "test_preprocess"),
    ):
        cli.add_argument(flag, default=default_value)
    return cli
21
+
22
+
23
def _quat_to_rotmat(w, x, y, z):
    """Return the 3x3 rotation matrix of the quaternion (w, x, y, z)."""
    return np.array(
        [
            [1 - 2 * y * y - 2 * z * z, 2 * x * y - 2 * z * w, 2 * x * z + 2 * y * w],
            [2 * x * y + 2 * z * w, 1 - 2 * x * x - 2 * z * z, 2 * y * z - 2 * x * w],
            [2 * x * z - 2 * y * w, 2 * y * z + 2 * x * w, 1 - 2 * x * x - 2 * y * y],
        ]
    )


def _read_colmap_cameras(cameras_txt):
    """Parse a cameras.txt file into a {camera_id: 3x3 intrinsics} dict."""
    cam_params = {}
    with open(cameras_txt, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            parts = line.split()
            if not parts:
                continue
            # Fields 4..7 are read as fx, fy, cx, cy.
            # NOTE(review): this matches a 4-parameter pinhole camera line;
            # confirm the models never use another parameter layout.
            fx, fy, cx, cy = (float(v) for v in parts[4:8])
            cam_params[int(parts[0])] = np.array(
                [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]
            )
    return cam_params


def _read_colmap_images(images_txt, cam_params):
    """Parse an images.txt file into parallel lists (names, poses, intrinsics)."""
    names, poses, intrinsics = [], [], []
    with open(images_txt, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            parts = line.split()
            # Skip blank lines and lines whose first token contains a ".":
            # those are treated as 2-D point lines, not image headers.
            if not parts or "." in parts[0]:
                continue

            w, x, y, z = map(float, parts[1:5])
            tx, ty, tz = map(float, parts[5:8])
            pose = np.eye(4)
            pose[:3, :3] = _quat_to_rotmat(w, x, y, z)
            pose[:3, 3] = [tx, ty, tz]

            # The file stores the transform as read above; its inverse is saved.
            poses.append(np.linalg.inv(pose))
            names.append(parts[-1])
            intrinsics.append(cam_params[int(parts[-2])])
    return names, poses, intrinsics


def main(rootdir, outdir):
    """
    Export images and camera parameters for every sub-sequence under ``rootdir``.

    The layout walked is ``rootdir/<env>/<subseq>/{sparse,images}``. For each
    sub-sequence the images are re-saved to ``outdir/<env>/<subseq>/rgb`` and
    one ``.npz`` per image (keys ``intrinsic`` and ``pose``) is written to
    ``outdir/<env>/<subseq>/cam``.

    Parameters:
        rootdir: directory holding one folder per environment.
        outdir: destination directory (created if missing).
    """
    os.makedirs(outdir, exist_ok=True)

    envs = [f for f in os.listdir(rootdir) if os.path.isdir(osp.join(rootdir, f))]
    for env in tqdm(envs):
        env_dir = osp.join(rootdir, env)
        subseqs = [f for f in os.listdir(env_dir) if os.path.isdir(osp.join(env_dir, f))]
        for subseq in subseqs:
            sparse_dir = osp.join(rootdir, env, subseq, "sparse")
            images_dir = osp.join(rootdir, env, subseq, "images")

            # NOTE(review): presumably writes the text model (cameras.txt /
            # images.txt) into sparse_dir; confirm against read_write_model.run.
            run(sparse_dir, sparse_dir)

            cam_params = _read_colmap_cameras(osp.join(sparse_dir, "cameras.txt"))
            images, poses, intrinsics = _read_colmap_images(
                osp.join(sparse_dir, "images.txt"), cam_params
            )

            out_rgb_dir = osp.join(outdir, env, subseq, "rgb")
            out_cam_dir = osp.join(outdir, env, subseq, "cam")
            os.makedirs(osp.join(outdir, env, subseq), exist_ok=True)
            os.makedirs(out_rgb_dir, exist_ok=True)
            os.makedirs(out_cam_dir, exist_ok=True)

            for i, img_name in enumerate(tqdm(images)):
                basename = img_name.split("/")[-1]
                # The context manager closes each source image after saving.
                with Image.open(os.path.join(images_dir, img_name)) as rgb:
                    rgb.save(osp.join(out_rgb_dir, basename))
                np.savez(
                    osp.join(out_cam_dir, basename.replace(".jpg", ".npz")),
                    intrinsic=intrinsics[i],
                    pose=poses[i],
                )
118
+
119
+
120
+ if __name__ == "__main__":
121
+ parser = get_parser()
122
+ args = parser.parse_args()
123
+ main(args.mapfree_dir, args.output_dir)
extern/CUT3R/datasets_preprocess/preprocess_megadepth.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # Preprocessing code for the MegaDepth dataset
3
+ # dataset at https://www.cs.cornell.edu/projects/megadepth/
4
+ # --------------------------------------------------------
5
+ import os
6
+ import os.path as osp
7
+ import collections
8
+ from tqdm import tqdm
9
+ import numpy as np
10
+
11
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
12
+ import cv2
13
+ import h5py
14
+
15
+ import path_to_root # noqa
16
+ from datasets_preprocess.utils.parallel import parallel_threads
17
+ from datasets_preprocess.utils import cropping # noqa
18
+
19
+
20
def get_parser():
    """Build the command-line parser for the MegaDepth preprocessing script."""
    import argparse

    option_specs = (
        ("--megadepth_dir", {"required": True}),
        ("--num_views", {"default": 64, "type": int}),
        ("--precomputed_sets", {"required": True}),
        ("--output_dir", {"default": "data/dust3r_data/processed_megadepth"}),
    )
    cli = argparse.ArgumentParser()
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
29
+
30
+
31
def main(db_root, pairs_path, output_dir, num_views):
    """
    Preprocess every MegaDepth scene referenced by the precomputed sets file.

    Parameters:
        db_root: root directory of the MegaDepth dataset.
        pairs_path: path of the ``.npz`` file holding the "scenes" and "sets" arrays.
        output_dir: destination directory (created if missing).
        num_views: number of view indices read from each row of "sets"
            (columns 1..num_views; column 0 is the scene index).
    """
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): allow_pickle=True unpickles object arrays; only load
    # precomputed sets from a trusted source.
    data = np.load(pairs_path, allow_pickle=True)
    scenes = data["scenes"]
    sets = data["sets"]

    # Group the image indices of all sets by scene index. Only the scene keys
    # are used below: every image that has intrinsics and exists on disk is
    # processed, not just the ones listed in the sets.
    todo = collections.defaultdict(set)
    for line in sets:
        for i in range(1, num_views + 1):
            todo[line[0]].add(line[i])

    # For each scene, load the camera parameters and then process in parallel.
    for scene_idx in tqdm(todo, desc="Overall"):
        scene, subscene = scenes[scene_idx].split()
        out_dir = osp.join(output_dir, scene, subscene)
        os.makedirs(out_dir, exist_ok=True)

        # load all camera params
        _, pose_w2cam, intrinsics = _load_kpts_and_poses(
            db_root, scene, subscene, intrinsics=True
        )

        in_dir = osp.join(db_root, scene, "dense" + subscene)
        args = [
            (in_dir, img, intrinsics[img], pose_w2cam[img], out_dir)
            for img in intrinsics.keys()
            if os.path.exists(osp.join(in_dir, "imgs", img))
        ]
        parallel_threads(
            resize_one_image,
            args,
            star_args=True,
            front_num=0,
            leave=False,
            desc=f"{scene}/{subscene}",
        )

    print("Done! prepared all images in", output_dir)
76
+
77
+
78
def resize_one_image(root, tag, K_pre_rectif, pose_w2cam, out_dir):
    """
    Rescale one image and its depth map, then write image, depth and camera files.

    Parameters:
        root: scene directory containing the "imgs" and "depths" folders.
        tag: image file name; also used as the stem of the output files.
        K_pre_rectif: tuple (image size, intrinsics matrix, distortion coefficients).
        pose_w2cam: 4x4 world-to-camera matrix.
        out_dir: output directory.

    Nothing is done when ``<out_dir>/<tag>.npz`` already exists.
    """
    cam_file = osp.join(out_dir, f"{tag}.npz")
    if osp.isfile(cam_file):
        return

    # OpenCV loads BGR; convert to RGB.
    bgr = cv2.imread(osp.join(root, "imgs", tag), cv2.IMREAD_COLOR)
    img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # The depth map lives next to the image, with the extension swapped for .h5.
    depth_file = osp.join(root, "depths", f"{osp.splitext(tag)[0]}.h5")
    with h5py.File(depth_file, "r") as hd5:
        depthmap = np.asarray(hd5["depth"])

    # Rectify: compute the intrinsics of the undistorted image.
    imsize_pre, K_pre, distortion = K_pre_rectif
    imsize_post = img.shape[1::-1]  # (width, height)
    K_post, _roi = cv2.getOptimalNewCameraMatrix(
        K_pre,
        distortion,
        imsize_pre,
        alpha=0,
        newImgSize=imsize_post,
        centerPrincipalPoint=True,
    )

    # Downscale image, depth and intrinsics together.
    img_out, depthmap_out, intrinsics_out, R_in2out = _downscale_image(
        K_post, img, depthmap, resolution_out=(800, 600)
    )

    img_out.save(osp.join(out_dir, f"{tag}.jpg"), quality=90)
    cv2.imwrite(osp.join(out_dir, f"{tag}.exr"), depthmap_out)

    # Invert to camera-to-world and fold in the rotation from the downscale step.
    camout2world = np.linalg.inv(pose_w2cam)
    camout2world[:3, :3] = camout2world[:3, :3] @ R_in2out.T
    np.savez(cam_file, intrinsics=intrinsics_out, cam2world=camout2world)
120
+
121
+
122
def _downscale_image(camera_intrinsics, image, depthmap, resolution_out=(512, 384)):
    """
    Rescale an image, its depth map and intrinsics to the target resolution.

    The two values of ``resolution_out`` are ordered to match the image
    orientation: ascending for a portrait image (width < height), descending
    otherwise. The returned rotation is always the 3x3 identity.

    Returns:
        (image, depthmap, intrinsics, R_in2out)
    """
    height, width = image.shape[:2]
    is_portrait = width < height
    target_resolution = sorted(resolution_out, reverse=not is_portrait)

    image, depthmap, intrinsics_out = cropping.rescale_image_depthmap(
        image, depthmap, camera_intrinsics, target_resolution, force=False
    )
    return image, depthmap, intrinsics_out, np.eye(3)
132
+
133
+
134
def _load_kpts_and_poses(root, scene_id, subscene, z_only=False, intrinsics=False):
    """
    Read poses, 3-D point indices and optionally intrinsics for one sub-scene.

    Files are read from ``root/scene_id/sparse/manhattan/subscene``.

    Parameters:
        root: dataset root directory.
        scene_id: scene folder name.
        subscene: sub-scene folder name.
        z_only: when True, each pose is the camera principal axis instead of
            the full 4x4 world-to-camera matrix.
        intrinsics: when True, cameras.txt is parsed as well.
    Returns:
        ``(points3D_idxs, poses)``, or ``(points3D_idxs, poses, image_intrinsics)``
        when ``intrinsics`` is True. Every dict is keyed by image name; an
        intrinsics entry is ``((width, height), K, (k0, 0, 0, 0))``.
    """
    model_dir = os.path.join(root, scene_id, "sparse", "manhattan", subscene)

    if intrinsics:
        with open(os.path.join(model_dir, "cameras.txt"), "r") as f:
            raw = f.readlines()[3:]  # skip the header

        camera_intrinsics = {}
        for cam_line in raw:
            fields = cam_line.split(" ")
            # Fields after the camera id and model name: six values, so a
            # single focal length and a single distortion coefficient.
            width, height, focal, cx, cy, k0 = [float(elem) for elem in fields[2:]]
            K = np.eye(3)
            K[0, 0] = focal
            K[1, 1] = focal
            K[0, 2] = cx
            K[1, 2] = cy
            camera_intrinsics[int(fields[0])] = (
                (int(width), int(height)),
                K,
                (k0, 0, 0, 0),
            )

    with open(os.path.join(model_dir, "images.txt"), "r") as f:
        raw = f.read().splitlines()[4:]  # skip the header

    extract_pose = (
        colmap_raw_pose_to_principal_axis if z_only else colmap_raw_pose_to_RT
    )

    poses = {}
    points3D_idxs = {}
    camera_ids = []

    # Each image takes two lines: the pose header, then its 2-D points.
    for image, points in zip(raw[::2], raw[1::2]):
        image = image.split(" ")
        points = points.split(" ")

        image_id = image[-1]
        camera_ids.append(int(image[-2]))

        # Everything between the leading id and the camera id is the raw pose.
        raw_pose = [float(elem) for elem in image[1:-2]]
        poses[image_id] = extract_pose(raw_pose)

        # Every third token is a 3-D point id; "-1" marks an unmatched point.
        current_points3D_idxs = {int(i) for i in points[2::3] if i != "-1"}
        # The original assert message called an undefined debugging helper,
        # which would raise NameError instead of AssertionError.
        assert -1 not in current_points3D_idxs
        points3D_idxs[image_id] = current_points3D_idxs

    if intrinsics:
        # poses preserves insertion order, matching camera_ids one-to-one.
        image_intrinsics = {
            im_id: camera_intrinsics[cam] for im_id, cam in zip(poses, camera_ids)
        }
        return points3D_idxs, poses, image_intrinsics
    else:
        return points3D_idxs, poses
194
+
195
+
196
def colmap_raw_pose_to_principal_axis(image_pose):
    """
    Return the third row of the rotation matrix encoded by a raw pose.

    The first four values of ``image_pose`` are the quaternion (w, x, y, z),
    which is normalized before use. The result is a float32 array of length 3.
    """
    quat = image_pose[:4]
    w, x, y, z = quat / np.linalg.norm(quat)
    row = [
        2 * x * z - 2 * y * w,
        2 * y * z + 2 * x * w,
        1 - 2 * x * x - 2 * y * y,
    ]
    return np.float32(row)
204
+
205
+
206
def colmap_raw_pose_to_RT(image_pose):
    """
    Convert a raw pose (qw, qx, qy, qz, tx, ty, tz) into a 4x4 matrix.

    The quaternion is normalized, turned into a rotation matrix and combined
    with the translation into a world-to-camera transform.
    """
    quat = image_pose[:4]
    w, x, y, z = quat / np.linalg.norm(quat)

    row0 = [1 - 2 * y * y - 2 * z * z, 2 * x * y - 2 * z * w, 2 * x * z + 2 * y * w]
    row1 = [2 * x * y + 2 * z * w, 1 - 2 * x * x - 2 * z * z, 2 * y * z - 2 * x * w]
    row2 = [2 * x * z - 2 * y * w, 2 * y * z + 2 * x * w, 1 - 2 * x * x - 2 * y * y]

    # World-to-camera pose: rotation in the top-left block, translation in
    # the last column.
    world2cam = np.eye(4)
    world2cam[:3, :3] = np.array([row0, row1, row2])
    world2cam[:3, 3] = image_pose[4:7]
    return world2cam
224
+
225
+
226
+ if __name__ == "__main__":
227
+ parser = get_parser()
228
+ args = parser.parse_args()
229
+ main(args.megadepth_dir, args.precomputed_sets, args.output_dir, args.num_views)
extern/CUT3R/datasets_preprocess/preprocess_mp3d.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess the Matterport3D (MP3D) dataset.
4
+
5
+ This script reads camera parameters and overlap data from a configuration file,
6
+ processes RGB images and corresponding depth images, adjusts camera poses using a
7
+ conversion matrix, and then saves the processed images, depth maps, and camera
8
+ metadata into separate output directories.
9
+
10
+ Usage:
11
+ python preprocess_mp3d.py --root_dir /path/to/data_mp3d/v1/scans \
12
+ --out_dir /path/to/processed_mp3d
13
+ """
14
+
15
+ import os
16
+ import numpy as np
17
+ import cv2
18
+ import shutil
19
+ from concurrent.futures import ProcessPoolExecutor, as_completed
20
+ from tqdm import tqdm
21
+ import argparse
22
+
23
+
24
def process_image(args):
    """
    Convert one MP3D frame: scale its depth image, adjust its pose and save
    the RGB copy, depth map and camera file.

    Parameters:
        args: tuple of
            (i, paths, K, pose, img_dir, depth_dir,
             out_rgb_dir, out_depth_dir, out_cam_dir, R_conv)
            where ``i`` is the frame index used for the output names, ``paths``
            is (depth filename, RGB filename), ``K`` and ``pose`` are the
            camera intrinsics and pose, and ``R_conv`` is the matrix that
            right-multiplies the pose.
    Returns:
        None if successful, or an error string if processing fails.
    """
    (
        frame_idx,
        (depth_name, rgb_name),
        K,
        pose,
        img_dir,
        depth_dir,
        out_rgb_dir,
        out_depth_dir,
        out_cam_dir,
        R_conv,
    ) = args

    rgb_src = os.path.join(img_dir, rgb_name)
    depth_src = os.path.join(depth_dir, depth_name)

    try:
        # Read the depth image at its native bit depth, then divide by 4000.
        raw_depth = cv2.imread(depth_src, cv2.IMREAD_ANYDEPTH)
        depth = raw_depth.astype(np.float32) / 4000.0

        # Apply the coordinate-convention conversion on the right of the pose.
        pose_adjusted = pose @ R_conv

        # Output files are named after the zero-padded frame index.
        stem = f"{frame_idx:06d}"
        shutil.copyfile(rgb_src, os.path.join(out_rgb_dir, f"{stem}.png"))
        np.save(os.path.join(out_depth_dir, f"{stem}.npy"), depth)
        np.savez(
            os.path.join(out_cam_dir, f"{stem}.npz"),
            intrinsics=K,
            pose=pose_adjusted,
        )
    except Exception as e:
        return f"Error processing image {rgb_name}: {e}"

    return None
90
+
91
+
92
def main():
    """
    Entry point: preprocess every MP3D scan directory found under ``--root_dir``.

    For each scan this saves the image-overlap table as ``overlap.npy``, parses
    the camera configuration file into per-image intrinsics and poses, and
    processes all frames in a process pool. A scan whose configuration yields
    mismatched numbers of images, intrinsics and poses is reported and skipped.
    Errors reported by workers are printed.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess MP3D scans: convert and save RGB images, depth maps, and camera metadata."
    )
    parser.add_argument(
        "--root_dir",
        type=str,
        default="/path/to/data_mp3d/v1/scans",
        help="Root directory of the raw MP3D data.",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default="/path/to/processed_mp3d",
        help="Output directory for processed MP3D data.",
    )
    args = parser.parse_args()

    root = args.root_dir
    out_dir = args.out_dir

    # List sequence directories (each scan is stored as a separate directory).
    seqs = sorted(d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d)))

    # Conversion matrix applied to every pose: flips the Y and Z axes.
    R_conv = np.array(
        [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]], dtype=np.float32
    )

    # os.cpu_count() may return None, and halving a count of 1 gives 0;
    # ProcessPoolExecutor requires at least one worker.
    num_workers = max(1, (os.cpu_count() or 1) // 2)

    for seq in tqdm(seqs, desc="Sequences"):
        # Images and depth files are stored under a subdirectory with the
        # same name as the sequence.
        seq_dir = os.path.join(root, seq, seq)

        img_dir = os.path.join(seq_dir, "undistorted_color_images")
        depth_dir = os.path.join(seq_dir, "undistorted_depth_images")
        cam_file = os.path.join(seq_dir, "undistorted_camera_parameters", f"{seq}.conf")
        overlap_file = os.path.join(seq_dir, "image_overlap_data", f"{seq}_iis.txt")

        # Read the overlap data (columns 1-3 of each line) and save it.
        overlap = []
        with open(overlap_file, "r") as f:
            for line in f:
                parts = line.split()
                overlap.append([int(parts[1]), int(parts[2]), float(parts[3])])
        overlap = np.array(overlap)
        os.makedirs(os.path.join(out_dir, seq), exist_ok=True)
        np.save(os.path.join(out_dir, seq, "overlap.npy"), overlap)

        # Read camera parameters from the configuration file.
        intrinsics = []
        camera_poses = []
        image_files = []

        current_intrinsics = None
        with open(cam_file, "r") as file:
            for line in file:
                parts = line.split()
                if not parts:
                    continue
                if parts[0] == "intrinsics_matrix":
                    # Nine values follow the keyword, read as a row-major 3x3
                    # matrix: fx and cx from the first row, fy and cy from the
                    # second.
                    fx, cx, fy, cy = (
                        float(parts[1]),
                        float(parts[3]),
                        float(parts[5]),
                        float(parts[6]),
                    )
                    current_intrinsics = np.array(
                        [[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32
                    )
                elif parts[0] == "scan":
                    # "scan <depth file> <color file> <16 pose values>"; the
                    # most recent intrinsics_matrix applies to this scan.
                    image_files.append((parts[1], parts[2]))
                    matrix_values = list(map(float, parts[3:]))
                    camera_poses.append(np.array(matrix_values).reshape(4, 4))
                    if current_intrinsics is not None:
                        intrinsics.append(current_intrinsics.copy())

        if not (len(image_files) == len(intrinsics) == len(camera_poses)):
            print(f"Inconsistent data in sequence {seq}")
            continue

        # Prepare output directories.
        out_rgb_dir = os.path.join(out_dir, seq, "rgb")
        out_depth_dir = os.path.join(out_dir, seq, "depth")
        out_cam_dir = os.path.join(out_dir, seq, "cam")
        os.makedirs(out_rgb_dir, exist_ok=True)
        os.makedirs(out_depth_dir, exist_ok=True)
        os.makedirs(out_cam_dir, exist_ok=True)

        tasks = [
            (
                i,
                paths,
                K,
                pose,
                img_dir,
                depth_dir,
                out_rgb_dir,
                out_depth_dir,
                out_cam_dir,
                R_conv,
            )
            for i, (paths, K, pose) in enumerate(
                zip(image_files, intrinsics, camera_poses)
            )
        ]

        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = {executor.submit(process_image, task): task[0] for task in tasks}
            for future in tqdm(
                as_completed(futures), total=len(futures), desc=f"Processing {seq}"
            ):
                error = future.result()
                if error:
                    print(error)
214
+
215
+
216
+ if __name__ == "__main__":
217
+ main()
extern/CUT3R/datasets_preprocess/preprocess_mvimgnet.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess the MVImgNet dataset.
4
+
5
+ This script processes MVImgNet sequences by:
6
+ - Loading a sparse SFM reconstruction.
7
+ - Undistorting and rescaling RGB images.
8
+ - Converting COLMAP intrinsics between conventions.
9
+ - Saving the processed images and camera metadata.
10
+
11
+ Usage:
12
+ python preprocess_mvimgnet.py --data_dir /path/to/MVImgNet_data \
13
+ --output_dir /path/to/processed_mvimgnet
15
+ """
16
+
17
+ import os
18
+ import os.path as osp
19
+ import argparse
20
+ import numpy as np
21
+ import open3d as o3d
22
+ import pyrender
23
+ import PIL.Image as Image
24
+ import cv2
25
+ import shutil
26
+ from tqdm import tqdm
27
+ import matplotlib.pyplot as plt
28
+
29
+ # Import your custom SFM processing function.
30
+ from read_write_model import run # Assumed to be available
31
+
32
# Prefer the resampling filters exposed on `Image.Resampling`; when that
# attribute is missing, fall back to the constants defined directly on `Image`.
try:
    lanczos, bicubic = Image.Resampling.LANCZOS, Image.Resampling.BICUBIC
except AttributeError:
    lanczos, bicubic = Image.LANCZOS, Image.BICUBIC

# Homogeneous 4x4 transform that negates the Y and Z axes, i.e. the axis flip
# between OpenGL-style and OpenCV-style camera frames.
OPENGL_TO_OPENCV = np.diag(np.float32([1, -1, -1, 1]))
44
+
45
+
46
+ # -----------------------------------------------------------------------------
47
+ # Helper Classes and Functions
48
+ # -----------------------------------------------------------------------------
49
class ImageList:
    """Thin wrapper that applies the same PIL operation to one or more images."""

    def __init__(self, images):
        # A lone image is treated as a one-element collection.
        if not isinstance(images, (list, tuple)):
            images = [images]
        # Anything that is not already a PIL image is converted via Image.fromarray.
        self.images = [
            im if isinstance(im, Image.Image) else Image.fromarray(im)
            for im in images
        ]

    def __len__(self):
        return len(self.images)

    def to_pil(self):
        """Return the single wrapped image, or a tuple of them when there are several."""
        if len(self.images) > 1:
            return tuple(self.images)
        return self.images[0]

    @property
    def size(self):
        """Common `size` of the wrapped images; asserts that they all agree."""
        all_sizes = [im.size for im in self.images]
        reference = all_sizes[0]
        assert all(current == reference for current in all_sizes)
        return reference

    def resize(self, *args, **kwargs):
        """Resize every image with the given arguments and wrap the results."""
        resized = [im.resize(*args, **kwargs) for im in self.images]
        return ImageList(resized)

    def crop(self, *args, **kwargs):
        """Crop every image with the given arguments and wrap the results."""
        cropped = [im.crop(*args, **kwargs) for im in self.images]
        return ImageList(cropped)
78
+
79
+
80
def colmap_to_opencv_intrinsics(K):
    """
    Convert COLMAP intrinsics (pixel centers at (0.5, 0.5)) to the OpenCV convention.

    Works on a copy of K: the principal point (last column of the first two
    rows) is shifted by -0.5 on both axes; the input is left untouched.
    """
    shifted = K.copy()
    shifted[:2, 2] -= 0.5
    return shifted
88
+
89
+
90
def opencv_to_colmap_intrinsics(K):
    """
    Convert OpenCV intrinsics (pixel centers at (0, 0)) to the COLMAP convention.

    Works on a copy of K: the principal point (last column of the first two
    rows) is shifted by +0.5 on both axes; the input is left untouched.
    """
    shifted = K.copy()
    shifted[:2, 2] += 0.5
    return shifted
98
+
99
+
100
def rescale_image_depthmap(
    image, depthmap, camera_intrinsics, output_resolution, force=True
):
    """
    Rescale an image, its optional depthmap and its intrinsics by one common factor.

    The factor is the larger of the per-axis ratios target/input (plus a tiny
    epsilon), so the result is at least as large as the requested resolution
    on both axes.

    Args:
        image: Input image (PIL image, or anything ImageList accepts).
        depthmap: Matching depth map, or None. Its first two dimensions must
            equal the image (height, width).
        camera_intrinsics: 3x3 NumPy array of intrinsics.
        output_resolution: Desired (width, height).
        force: When False, the inputs are returned untouched if reaching the
            target would require a scale factor >= 1.

    Returns:
        Tuple of (rescaled image, rescaled depthmap, updated intrinsics).
    """
    images = ImageList(image)
    src_wh = np.array(images.size)  # (W, H)
    target_wh = np.array(output_resolution)
    has_depth = depthmap is not None
    if has_depth:
        assert tuple(depthmap.shape[:2]) == images.size[::-1]

    scale_final = max(target_wh / images.size) + 1e-8
    if not force and scale_final >= 1:
        # Upscaling would be needed and the caller did not ask for it.
        return images.to_pil(), depthmap, camera_intrinsics

    new_wh = np.floor(src_wh * scale_final).astype(int)
    new_size = tuple(new_wh)

    # Lanczos when shrinking, bicubic otherwise.
    resample_filter = lanczos if scale_final < 1 else bicubic
    images = images.resize(new_size, resample=resample_filter)

    if has_depth:
        # Nearest-neighbour interpolation keeps depth values unblended.
        depthmap = cv2.resize(depthmap, new_size, interpolation=cv2.INTER_NEAREST)

    new_intrinsics = camera_matrix_of_crop(
        camera_intrinsics, src_wh, new_wh, scaling=scale_final
    )
    return images.to_pil(), depthmap, new_intrinsics
136
+
137
+
138
def camera_matrix_of_crop(
    input_camera_matrix,
    input_resolution,
    output_resolution,
    scaling=1,
    offset_factor=0.5,
    offset=None,
):
    """
    Update camera intrinsics after the image has been rescaled and/or cropped.

    The scaled input must be at least as large as the output on both axes
    (asserted). Unless `offset` is given explicitly, the crop offset is
    `offset_factor` times the leftover margin. Scaling and offset are applied
    in the COLMAP convention and the result is converted back to OpenCV.
    """
    scaled_resolution = np.asarray(input_resolution) * scaling
    margins = scaled_resolution - output_resolution
    assert np.all(margins >= 0.0)

    shift = offset_factor * margins if offset is None else offset

    K = opencv_to_colmap_intrinsics(input_camera_matrix)
    K[:2, :] *= scaling  # focal lengths and principal point scale together
    K[:2, 2] -= shift  # move the principal point by the crop offset
    return colmap_to_opencv_intrinsics(K)
158
+
159
+
160
def pose_from_qwxyz_txyz(elems):
    """
    Build a 4x4 pose from seven values: quaternion (qw, qx, qy, qz) then
    translation (tx, ty, tz).

    The rigid transform assembled from those values is inverted before being
    returned (i.e. the result is cam2world).
    """
    from scipy.spatial.transform import Rotation

    qw, qx, qy, qz, tx, ty, tz = [float(value) for value in elems]

    world_to_cam = np.eye(4)
    # scipy expects scalar-last quaternions, hence the reordering.
    world_to_cam[:3, :3] = Rotation.from_quat([qx, qy, qz, qw]).as_matrix()
    world_to_cam[:3, 3] = [tx, ty, tz]
    return np.linalg.inv(world_to_cam)
172
+
173
+
174
def load_sfm(sfm_dir):
    """
    Load a sparse COLMAP reconstruction from its text export.

    Reads ``cameras.txt`` and ``images.txt`` located in ``sfm_dir``.

    Returns:
        Tuple (img_idx, img_infos) where:
        - img_idx maps each image file name to its image id (kept as the
          string read from images.txt).
        - img_infos maps that id to a dict with:
            intrinsics:   [model_name, *remaining camera fields as floats]
            path:         image file name
            frame_id:     image file name
            cam_to_world: pose returned by pose_from_qwxyz_txyz
            sparse_pts2d: {point3D_id: (x, y)} for observations whose
                          point id is not -1

    Raises:
        AssertionError: if the same image name appears twice.
    """
    # Comment and blank lines are skipped explicitly instead of assuming a
    # fixed-size header, so trailing newlines or shorter headers are harmless.
    cameras = {}
    with open(osp.join(sfm_dir, "cameras.txt"), "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            camera_id, model, *params = line.split()
            cameras[int(camera_id)] = [model] + [float(x) for x in params]

    with open(osp.join(sfm_dir, "images.txt"), "r") as f:
        lines = [line for line in f.read().splitlines() if not line.startswith("#")]

    img_idx = {}
    img_infos = {}
    # Every image occupies two lines: its metadata, then its 2D observations.
    # The second line may be empty, so blank lines must NOT be filtered out.
    for meta_line, points_line in zip(lines[0::2], lines[1::2]):
        # Layout: IMAGE_ID QW QX QY QZ TX TY TZ CAMERA_ID NAME.
        # Capping the split keeps file names that contain spaces intact.
        fields = meta_line.split(" ", 9)
        idx = fields[0]
        img_name = fields[-1]
        assert img_name not in img_idx, f"Duplicate image: {img_name}"
        img_idx[img_name] = idx

        # Observations are stored as flat (X, Y, POINT3D_ID) triples.
        points = points_line.split()
        current_points2D = {
            int(i): (float(x), float(y))
            for i, x, y in zip(points[2::3], points[0::3], points[1::3])
            if i != "-1"
        }
        img_infos[idx] = dict(
            intrinsics=cameras[int(fields[-2])],
            path=img_name,
            frame_id=img_name,
            cam_to_world=pose_from_qwxyz_txyz(fields[1:-2]),
            sparse_pts2d=current_points2D,
        )
    return img_idx, img_infos
213
+
214
+
215
def undistort_images(intrinsics, rgb):
    """
    Build the 3x3 pinhole matrix K from a COLMAP camera entry.

    Despite its name, this function performs no undistortion: any distortion
    coefficients in the entry are ignored and `rgb` is returned unchanged.
    The principal point is copied verbatim from the entry; no half-pixel
    convention shift is applied here.

    Args:
        intrinsics: Camera entry laid out as [model_name, width, height, *params].
            For COLMAP models with a single shared focal length the params
            start with (f, cx, cy); for all other models they are read as
            (fx, fy, cx, cy).
        rgb: Image, passed through untouched.

    Returns:
        Tuple (width, height, K, rgb).
    """
    # COLMAP models whose parameter list starts with one focal length.
    # Reading them as (fx, fy, cx, cy) would silently take cx for fy.
    single_focal_models = (
        "SIMPLE_PINHOLE",
        "SIMPLE_RADIAL",
        "RADIAL",
        "SIMPLE_RADIAL_FISHEYE",
        "RADIAL_FISHEYE",
    )

    width = int(intrinsics[1])
    height = int(intrinsics[2])
    if intrinsics[0] in single_focal_models:
        fx = fy = intrinsics[3]
        cx = intrinsics[4]
        cy = intrinsics[5]
    else:
        fx = intrinsics[3]
        fy = intrinsics[4]
        cx = intrinsics[5]
        cy = intrinsics[6]

    K = np.zeros([3, 3])
    K[0, 0] = fx
    K[0, 2] = cx
    K[1, 1] = fy
    K[1, 2] = cy
    K[2, 2] = 1
    return width, height, K, rgb
233
+
234
+
235
+ # -----------------------------------------------------------------------------
236
+ # Processing Functions
237
+ # -----------------------------------------------------------------------------
238
def process_sequence(category, obj, data_dir, output_dir):
    """
    Process a single MVImgNet sequence.

    Steps:
      1. Run the imported `run` helper on the sequence's ``sparse/0`` folder,
         then load the reconstruction with `load_sfm`.
      2. For each image listed in the reconstruction that exists on disk:
         a. Load the image.
         b. Build K from the camera entry and rescale image + intrinsics so
            the result covers 640x480.
         c. Save the image as ``rgb/<stem>.jpg`` and the camera data
            (intrinsics in COLMAP convention, cam-to-world pose) as
            ``cam/<stem>.npz``.

    Args:
        category: Category folder name under ``MVImgNet_by_categories``.
        obj: Sequence name. A file extension, if present (e.g. "<seq>.ply"),
            is stripped; a bare directory name is used as-is.
        data_dir: Root directory containing ``MVImgNet_by_categories``.
        output_dir: Root directory for the processed output.
    """
    # Strip an extension only when there is one. Blind slicing with [:-4]
    # would chop characters off a plain directory name.
    seq_name = osp.splitext(obj)[0]

    # Define directories.
    seq_dir = osp.join(data_dir, "MVImgNet_by_categories", category, seq_name)
    rgb_dir = osp.join(seq_dir, "images")
    sfm_dir = osp.join(seq_dir, "sparse", "0")

    output_scene_dir = osp.join(output_dir, f"{category}_{seq_name}")
    output_rgb_dir = osp.join(output_scene_dir, "rgb")
    output_cam_dir = osp.join(output_scene_dir, "cam")
    os.makedirs(output_rgb_dir, exist_ok=True)
    os.makedirs(output_cam_dir, exist_ok=True)

    # NOTE(review): `run` is imported from read_write_model; presumably it
    # produces the text files that load_sfm reads -- confirm.
    run(sfm_dir, sfm_dir)
    img_idx, img_infos = load_sfm(sfm_dir)

    target_resolution = (640, int(640 * 3.0 / 4))  # 640x480

    for idx in img_idx.values():
        info = img_infos[idx]
        rgb_path = osp.join(rgb_dir, info["path"])
        if not osp.exists(rgb_path):
            # Images referenced by the reconstruction but missing on disk are skipped.
            continue

        # The context manager releases the file handle as soon as the pixels are read.
        with Image.open(rgb_path) as im:
            rgb = np.array(im)

        _, _, K, rgb = undistort_images(info["intrinsics"], rgb)
        intrinsics = colmap_to_opencv_intrinsics(K)
        # The rescaling helper works in the OpenCV convention.
        image, _, intrinsics = rescale_image_depthmap(
            rgb, None, intrinsics, target_resolution
        )
        intrinsics = opencv_to_colmap_intrinsics(intrinsics)

        # splitext copes with extensions of any length (e.g. ".jpeg").
        stem = osp.splitext(info["path"])[0]
        image.save(osp.join(output_rgb_dir, stem + ".jpg"))
        np.savez(
            osp.join(output_cam_dir, stem + ".npz"),
            intrinsics=intrinsics,
            pose=info["cam_to_world"],
        )
285
+
286
+
287
def main():
    """Parse the command line and process every entry of every category folder."""
    parser = argparse.ArgumentParser(
        description="Preprocess MVImgNet dataset: undistort, rescale images, and save camera parameters."
    )
    parser.add_argument(
        "--data_dir",
        type=str,
        default="/path/to/MVImgNet_data",
        help="Directory containing MVImgNet data (images and point clouds).",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="/path/to/processed_mvimgnet",
        help="Directory where processed data will be saved.",
    )
    args = parser.parse_args()

    categories_root = osp.join(args.data_dir, "MVImgNet_by_categories")

    # Only sub-directories count as categories.
    categories = sorted(
        name
        for name in os.listdir(categories_root)
        if osp.isdir(osp.join(categories_root, name))
    )

    for cat in categories:
        # Every entry of the category folder is handed to process_sequence.
        for obj in sorted(os.listdir(osp.join(categories_root, cat))):
            process_sequence(cat, obj, args.data_dir, args.output_dir)


if __name__ == "__main__":
    main()
extern/CUT3R/datasets_preprocess/preprocess_mvs_synth.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Preprocess the MVS Synth dataset.
4
+
5
+ This script processes each sequence in a given dataset directory by:
6
+ - Reading the RGB image, EXR depth image, and JSON camera parameters.
7
+ - Computing the camera pose from the extrinsic matrix (with a conversion matrix applied).
8
+ - Creating a simple camera intrinsics matrix from the provided focal lengths and principal point.
9
+ - Copying the RGB image unchanged under a .jpg file name (no re-encoding), saving the depth (as a NumPy array), and saving the camera data (as a NPZ file).
10
+
11
+ Usage:
12
+ python preprocess_mvs_synth.py --root_dir /path/to/data_mvs_synth/GTAV_720/ \
13
+ --out_dir /path/to/processed_mvs_synth \
14
+ --num_workers 32
15
+ """
16
+
17
+ import os
18
+ import shutil
19
+ import json
20
+ from concurrent.futures import ProcessPoolExecutor, as_completed
21
+ from tqdm import tqdm
22
+ import numpy as np
23
+ import cv2
24
+ import argparse
25
+
26
# Switch on OpenCV's OpenEXR codec, which is needed to read the .exr depth files.
# NOTE(review): this runs after `import cv2`; confirm the flag is still picked
# up on every target platform, or move it above the import.
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"

# 4x4 homogeneous matrix that swaps the first two axes and leaves the rest
# untouched (example conversion, adjust if needed).
R_conv = np.eye(4, dtype=np.float32)[[1, 0, 2, 3]]
33
+
34
+
35
def process_basename(seq, basename, root_dir, out_dir):
    """
    Process a single frame identified by 'basename' within a given sequence.

    Reads the camera parameters (JSON) and the depth map (EXR), computes the
    converted camera-to-world pose and the intrinsics matrix, then writes:
      - rgb/<basename>.jpg   : byte-for-byte copy of images/<basename>.png
                               (only the file name changes, no re-encoding),
      - depth/<basename>.npy : float32 depth with non-finite values set to 0,
      - cam/<basename>.npz   : arrays 'intrinsics' and 'pose'.

    Parameters:
        seq (str): The sequence (subdirectory) name.
        basename (str): The basename of the file (without extension).
        root_dir (str): Root directory containing the raw data.
        out_dir (str): Output directory where processed data will be saved.

    Returns:
        None on success, or an error string on failure. Exceptions are caught
        and reported as strings so one bad frame does not abort the caller.
    """
    try:
        # Input file paths.
        seq_dir = os.path.join(root_dir, seq)
        img_path = os.path.join(seq_dir, "images", basename + ".png")
        depth_path = os.path.join(seq_dir, "depths", basename + ".exr")
        cam_path = os.path.join(seq_dir, "poses", basename + ".json")

        # Output directories.
        out_seq_dir = os.path.join(out_dir, seq)
        out_img_dir = os.path.join(out_seq_dir, "rgb")
        out_depth_dir = os.path.join(out_seq_dir, "depth")
        out_cam_dir = os.path.join(out_seq_dir, "cam")
        for directory in (out_img_dir, out_depth_dir, out_cam_dir):
            os.makedirs(directory, exist_ok=True)

        # Output file paths.
        out_img_path = os.path.join(out_img_dir, basename + ".jpg")
        out_depth_path = os.path.join(out_depth_dir, basename + ".npy")
        out_cam_path = os.path.join(out_cam_dir, basename + ".npz")

        # Read and process camera parameters.
        with open(cam_path, "r") as f:
            cam_data = json.load(f)
        c_x = cam_data["c_x"]
        c_y = cam_data["c_y"]
        f_x = cam_data["f_x"]
        f_y = cam_data["f_y"]
        extrinsic = np.array(cam_data["extrinsic"])
        # Invert the extrinsic matrix to obtain the camera-to-world pose,
        # then apply the conversion matrix.
        pose = R_conv @ np.linalg.inv(extrinsic)

        # Build a simple intrinsics matrix.
        intrinsics = np.array(
            [[f_x, 0, c_x], [0, f_y, c_y], [0, 0, 1]], dtype=np.float32
        )

        if not np.all(np.isfinite(pose)):
            raise ValueError(f"Invalid pose for {basename}")

        # cv2.imread signals failure by returning None instead of raising;
        # report that explicitly rather than as an AttributeError.
        depth = cv2.imread(depth_path, cv2.IMREAD_ANYDEPTH)
        if depth is None:
            raise ValueError(f"Could not read depth map {depth_path}")
        depth = depth.astype(np.float32)
        depth[~np.isfinite(depth)] = 0.0  # Zero out inf and NaN values

        # Outputs are written only after every input was read successfully.
        shutil.copyfile(img_path, out_img_path)
        np.save(out_depth_path, depth)
        np.savez(out_cam_path, intrinsics=intrinsics, pose=pose)

    except Exception as e:
        return f"Error processing {seq}/{basename}: {e}"

    return None
112
+
113
+
114
def main():
    """Parse the command line and convert every PNG frame of every sequence in parallel."""
    parser = argparse.ArgumentParser(
        description="Preprocess MVS Synth dataset: convert images, depth, and camera data."
    )
    parser.add_argument(
        "--root_dir",
        type=str,
        default="/path/to/data_mvs_synth/GTAV_720/",
        help="Root directory of the raw MVS Synth data.",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default="/path/to/processed_mvs_synth",
        help="Output directory for processed data.",
    )
    parser.add_argument(
        "--num_workers", type=int, default=32, help="Number of parallel workers."
    )
    args = parser.parse_args()
    root_dir, out_dir = args.root_dir, args.out_dir

    # Every sub-directory of the root is one sequence.
    seqs = sorted(
        entry
        for entry in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, entry))
    )

    # Create the output layout up front, before any worker starts.
    for seq in seqs:
        for sub_dir in ("rgb", "depth", "cam"):
            os.makedirs(os.path.join(out_dir, seq, sub_dir), exist_ok=True)

    # One task per PNG frame, identified by its file name without the extension.
    tasks = []
    for seq in seqs:
        img_dir = os.path.join(root_dir, seq, "images")
        frame_names = sorted(
            name[:-4] for name in os.listdir(img_dir) if name.endswith(".png")
        )
        tasks.extend((seq, basename, root_dir, out_dir) for basename in frame_names)

    num_workers = args.num_workers
    print(f"Processing {len(tasks)} tasks using {num_workers} workers...")

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(process_basename, *task): task[1] for task in tasks}
        progress = tqdm(as_completed(futures), total=len(futures), desc="Processing")
        for future in progress:
            # Workers return an error string instead of raising.
            error = future.result()
            if error:
                print(error)


if __name__ == "__main__":
    main()